grafos_tensor/
lib.rs

1//! grafos-tensor -- Tensor operations backed by leased fabric memory.
2//!
3//! This crate provides [`FabricTensor`], an N-dimensional tensor whose data is
4//! stored as contiguous row-major `f32` values backed by fabric memory acquired
5//! through FBMU (Fabric Bootstrap Memory Unit) leases. Every constructor
6//! validates that the fabric has sufficient capacity before allocating, and
7//! **holds the lease for the lifetime of the tensor**.
8//!
9//! # Supported operations
10//!
11//! | Operation | Method | Constraints |
12//! |-----------|--------|-------------|
13//! | Matrix multiply | [`FabricTensor::matmul`] | Both 2-D; inner dims match |
14//! | Elementwise add | [`FabricTensor::add`] | Same shape |
15//! | Elementwise mul | [`FabricTensor::mul`] | Same shape |
16//! | Scalar multiply | [`FabricTensor::scale`] | Any shape |
17//! | ReLU | [`FabricTensor::relu`] | Any shape |
18//! | Softmax | [`FabricTensor::softmax`] | `axis < ndim` |
19//! | Subtract | [`FabricTensor::subtract`] | Same shape |
20//! | Sum axis | [`FabricTensor::sum_axis`] | `axis < ndim` |
21//! | Sigmoid | [`FabricTensor::sigmoid`] | Any shape |
22//! | Natural log | [`FabricTensor::ln`] | Any shape |
23//! | Clamp | [`FabricTensor::clip`] | Any shape |
24//! | Transpose | [`FabricTensor::transpose`] | ndim >= 2 |
25//! | Reshape | [`FabricTensor::reshape`] | Same total elements |
26//!
27//! Placement helpers:
28//! - [`FabricTensor::to_gpu`] / [`FabricTensor::to_cpu`]
29//! - [`FabricTensor::device`], [`FabricTensor::is_cpu`], [`FabricTensor::is_gpu`]
30//!
31//! With `gpu` feature enabled, operations on GPU-placed tensors dispatch
32//! through the v1 `fabricbios_gpu_v1` session surface — specifically a
33//! private `submit_signal_kernel` helper that opens a transient
34//! [`grafos_std::gpu::GpuSession`] on the tensor's held `GpuLease`,
35//! `module_load`s the kernel binary, `launch`es with `[1,1,1]` grid/block
36//! dims, `sync`s, and lets the session/module drop (RAII). Numerical
37//! outputs are still computed on the CPU path; GPU dispatch is
38//! control-path signaling only under the current mock kernels. See
39//! `docs/grafos-tensor-guide.md` for the full programming model.
40//!
41//! Operator overloading is provided for `&FabricTensor`: `+` (elementwise add),
42//! `*` (elementwise mul), and `* f32` (scalar mul).
43//!
44//! # Quick start
45//!
46//! ```rust
47//! use grafos_tensor::FabricTensor;
48//!
49//! # grafos_std::host::reset_mock();
50//! # grafos_std::host::mock_set_fbmu_arena_size(65536);
51//! let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
52//! let b = FabricTensor::from_slice(&[3, 2], &[7.0, 8.0, 9.0, 10.0, 11.0, 12.0]).unwrap();
53//! let c = a.matmul(&b).unwrap();
54//! assert_eq!(c.shape(), &[2, 2]);
55//! assert_eq!(c.get(&[0, 0]).unwrap(), 58.0);
56//! ```
57//!
58//! # Operator overloading
59//!
60//! ```rust
61//! use grafos_tensor::FabricTensor;
62//!
63//! # grafos_std::host::reset_mock();
64//! # grafos_std::host::mock_set_fbmu_arena_size(65536);
65//! let a = FabricTensor::from_slice(&[3], &[1.0, 2.0, 3.0]).unwrap();
66//! let b = FabricTensor::from_slice(&[3], &[4.0, 5.0, 6.0]).unwrap();
67//!
68//! let sum = (&a + &b).unwrap();        // elementwise add
69//! let prod = (&a * &b).unwrap();       // elementwise mul
70//! let scaled = (&a * 2.0f32).unwrap(); // scalar mul
71//! ```
72//!
73//! # Testing
74//!
75//! On native targets, initialize the mock FBMU backend before creating tensors:
76//!
77//! ```rust
78//! grafos_std::host::reset_mock();
79//! grafos_std::host::mock_set_fbmu_arena_size(1 << 20); // 1 MiB
80//! ```
81
82#![cfg_attr(not(feature = "std"), no_std)]
83
84extern crate alloc;
85
86#[cfg(feature = "gpu")]
87use alloc::borrow::Cow;
88use alloc::vec;
89use alloc::vec::Vec;
90use core::ops::{Add, Mul, Sub};
91
92use grafos_std::error::FabricError;
93#[cfg(feature = "gpu")]
94use grafos_std::gpu::{GpuBuilder, GpuLease, GpuSession};
95use grafos_std::mem::{MemBuilder, MemLease};
96#[cfg(all(feature = "gpu", feature = "std"))]
97use std::{env, fs, path::PathBuf};
98
/// Result alias using [`FabricError`] as the error type for every fallible
/// operation in this crate.
pub type Result<T> = core::result::Result<T, FabricError>;
101
102/// N-dimensional tensor backed by a fabric memory lease.
103///
104/// Tensor data is stored as a contiguous row-major `f32` array. Each
105/// constructor acquires a [`MemLease`] to validate
106/// that the fabric has sufficient capacity. The [`Shape`] metadata tracks
107/// dimensions and precomputed strides for multi-dimensional indexing.
108///
109/// # Storage layout
110///
111/// Data is laid out in row-major (C) order: the last dimension varies fastest
112/// in memory. For a `[2, 3]` matrix `[[a, b, c], [d, e, f]]`, the flat
113/// representation is `[a, b, c, d, e, f]` with strides `[3, 1]`.
114///
115/// # Examples
116///
117/// ```rust
118/// use grafos_tensor::FabricTensor;
119///
120/// # grafos_std::host::reset_mock();
121/// # grafos_std::host::mock_set_fbmu_arena_size(65536);
122/// // Create a 2x3 matrix and access elements
123/// let t = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
124/// assert_eq!(t.get(&[1, 2]).unwrap(), 6.0);
125/// assert_eq!(t.shape(), &[2, 3]);
126/// assert_eq!(t.strides(), &[3, 1]);
127/// ```
pub struct FabricTensor {
    // Backing fabric lease(s): a CPU `MemLease`, or (with the `gpu` feature)
    // a GPU lease plus a CPU-side staging lease.
    storage: TensorStorage,
    // Dimension sizes plus precomputed row-major strides.
    shape: Shape,
    // Element data, contiguous row-major; constructors keep its length equal
    // to `shape.numel()`.
    data: Vec<f32>,
}
133
/// Placement of tensor data in the fabric.
///
/// Returned by [`FabricTensor::device`]; derives `PartialEq`/`Eq` so it can
/// be compared directly or pattern-matched.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Device {
    /// Tensor is placed on CPU memory (FBMU-backed lease).
    Cpu,
    /// Tensor is placed on GPU memory (GPU lease id).
    Gpu(u128),
}
142
/// Backing storage for a tensor's fabric lease(s).
enum TensorStorage {
    /// CPU placement: a single FBMU-backed memory lease.
    Cpu(MemLease),
    /// GPU placement: a VRAM lease plus a CPU-side staging lease.
    #[cfg(feature = "gpu")]
    Gpu {
        gpu_lease: GpuLease,
        staging_lease: MemLease,
    },
}
151
152/// Shape metadata for an N-dimensional tensor.
153///
154/// Stores dimension sizes and precomputed row-major strides. Strides follow
155/// the standard row-major formula: `strides[n-1] = 1` and
156/// `strides[i] = strides[i+1] * dims[i+1]` for preceding dimensions.
157///
158/// # Examples
159///
160/// ```rust
161/// use grafos_tensor::Shape;
162///
163/// let s = Shape::new(&[2, 3, 4]);
164/// assert_eq!(s.dims(), &[2, 3, 4]);
165/// assert_eq!(s.strides(), &[12, 4, 1]);
166/// assert_eq!(s.numel(), 24);
167/// assert_eq!(s.ndim(), 3);
168/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Shape {
    // Dimension sizes, outermost first.
    dims: Vec<usize>,
    // Row-major strides matching `dims`; empty for a 0-D scalar.
    strides: Vec<usize>,
}

impl Shape {
    /// Build a shape from dimension sizes, precomputing row-major strides.
    ///
    /// The last dimension gets stride 1; each earlier dimension's stride is
    /// the product of all later dimension sizes. An empty `dims` slice
    /// yields a scalar (0-D) shape with `numel() == 1`.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use grafos_tensor::Shape;
    ///
    /// let s = Shape::new(&[2, 3]);
    /// assert_eq!(s.strides(), &[3, 1]);
    ///
    /// let scalar = Shape::new(&[]);
    /// assert_eq!(scalar.numel(), 1);
    /// ```
    pub fn new(dims: &[usize]) -> Self {
        Shape {
            dims: dims.to_vec(),
            strides: Self::compute_strides(dims),
        }
    }

    // Row-major strides: accumulate the running element count from the
    // innermost (last) dimension outward, then flip into place. An empty
    // `dims` produces an empty stride vector.
    fn compute_strides(dims: &[usize]) -> Vec<usize> {
        let mut strides = Vec::with_capacity(dims.len());
        let mut span = 1usize;
        for &dim in dims.iter().rev() {
            strides.push(span);
            span *= dim;
        }
        strides.reverse();
        strides
    }

    /// Total number of elements in the tensor (1 for a 0-D scalar).
    pub fn numel(&self) -> usize {
        self.dims.iter().copied().fold(1, |acc, d| acc * d)
    }

    /// Number of dimensions.
    pub fn ndim(&self) -> usize {
        self.dims.len()
    }

    /// Dimension sizes.
    pub fn dims(&self) -> &[usize] {
        self.dims.as_slice()
    }

    /// Row-major strides.
    pub fn strides(&self) -> &[usize] {
        self.strides.as_slice()
    }

    /// Map multi-dimensional indices to a flat row-major offset.
    ///
    /// Returns `None` when the index count differs from `ndim()` or any
    /// single index is outside its dimension's range.
    fn flat_index(&self, indices: &[usize]) -> Option<usize> {
        if indices.len() != self.dims.len() {
            return None;
        }
        indices
            .iter()
            .zip(self.dims.iter().zip(self.strides.iter()))
            .try_fold(0usize, |acc, (&idx, (&dim, &stride))| {
                (idx < dim).then(|| acc + idx * stride)
            })
    }
}
251
252/// Execute a single signal-only GPU kernel dispatch via the v1
253/// `GpuSession` surface.
254///
255/// grafos-tensor's GPU dispatch is control-path signaling only: kernel
256/// binaries default to the `b"grafos.tensor.mock"` stub and numerical
257/// results are computed by the CPU path (see
258/// `docs/grafos-tensor-guide.md`). This helper encapsulates the canonical
259/// v1 sequence — `module_load` → `launch` → `sync` — with the
260/// `GpuModule` and `GpuSession` dropping at function exit (RAII unload +
261/// release of the borrowed lease).
262///
263/// Grid and block default to `[1, 1, 1]` because the mock kernel does
264/// not read them. Real kernels would require caller-provided launch
265/// geometry, which is deferred until grafos-tensor ships real kernels.
#[cfg(feature = "gpu")]
fn submit_signal_kernel(
    gpu_lease: &GpuLease,
    op_name: &str,
    binary: &[u8],
    args: &[u8],
    arg_sizes: &[u32],
) -> Result<()> {
    // Canonical v1 sequence: open a transient session on the borrowed
    // lease, load the module, launch once with unit grid/block dims, then
    // block until completion. Session and module drop at function exit
    // (RAII unload / release of the borrow).
    let mut session = GpuSession::new(gpu_lease);
    let module = session.module_load(binary)?;
    session.launch(&module, op_name, [1, 1, 1], [1, 1, 1], args, arg_sizes)?;
    session.sync()?;
    Ok(())
}
280
281impl FabricTensor {
    /// Borrow the CPU-side [`MemLease`]: the `Cpu` lease directly, or the
    /// staging lease of a GPU-placed tensor.
    fn cpu_lease(&self) -> &MemLease {
        match &self.storage {
            TensorStorage::Cpu(lease) => lease,
            #[cfg(feature = "gpu")]
            TensorStorage::Gpu { staging_lease, .. } => staging_lease,
        }
    }

    /// Borrow the GPU lease, or `None` when the tensor is CPU-placed.
    #[cfg(feature = "gpu")]
    fn gpu_lease(&self) -> Option<&GpuLease> {
        match &self.storage {
            TensorStorage::Cpu(_) => None,
            TensorStorage::Gpu { gpu_lease, .. } => Some(gpu_lease),
        }
    }

    /// Access the underlying fabric memory lease held by this tensor.
    ///
    /// The current implementation stores element data in a local `Vec<f32>`,
    /// but holds a [`MemLease`] as the capacity/liveness contract for the
    /// tensor's lifetime. For a GPU-placed tensor this is the staging lease.
    pub fn lease(&self) -> &MemLease {
        self.cpu_lease()
    }
306
307    /// Wrap an existing memory lease as a tensor.
308    ///
309    /// The tensor takes ownership of `lease` and uses it as its backing
310    /// memory contract. `data` is initialized to zeros.
311    pub fn from_mem_lease(shape: &[usize], lease: MemLease) -> Self {
312        let shape = Shape::new(shape);
313        let numel = shape.numel();
314        FabricTensor {
315            storage: TensorStorage::Cpu(lease),
316            shape,
317            data: vec![0.0; numel],
318        }
319    }
320
321    /// Create a tensor filled with zeros.
322    ///
323    /// Acquires a [`MemLease`] large enough for
324    /// `numel * 4` bytes and initializes all elements to `0.0`.
325    ///
326    /// # Errors
327    ///
328    /// Returns [`FabricError::CapacityExceeded`] if the fabric cannot provide
329    /// sufficient memory, or [`FabricError::Disconnected`] if the FBMU
330    /// handshake fails.
331    ///
332    /// # Examples
333    ///
334    /// ```rust
335    /// use grafos_tensor::FabricTensor;
336    ///
337    /// # grafos_std::host::reset_mock();
338    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
339    /// let t = FabricTensor::zeros(&[3, 4]).unwrap();
340    /// assert_eq!(t.shape(), &[3, 4]);
341    /// assert_eq!(t.numel(), 12);
342    /// assert_eq!(t.get(&[0, 0]).unwrap(), 0.0);
343    /// ```
344    pub fn zeros(shape: &[usize]) -> Result<Self> {
345        let s = Shape::new(shape);
346        let numel = s.numel();
347        let byte_size = numel * core::mem::size_of::<f32>();
348        // Acquire and hold a lease to prove (and reserve) fabric capacity for
349        // the lifetime of this tensor. The current implementation stores the
350        // actual element data locally, but the lease is still the liveness and
351        // capacity contract.
352        let lease = MemBuilder::new().min_bytes(byte_size as u64).acquire()?;
353        Ok(FabricTensor {
354            storage: TensorStorage::Cpu(lease),
355            shape: s,
356            data: vec![0.0f32; numel],
357        })
358    }
359
360    /// Create a tensor from a slice of data.
361    ///
362    /// The length of `data` must exactly equal the product of `shape`
363    /// dimensions. Data is copied into the tensor in row-major order.
364    ///
365    /// # Errors
366    ///
367    /// Returns [`FabricError::CapacityExceeded`] if `data.len()` does not
368    /// match the shape's element count, or if the fabric cannot provide
369    /// sufficient memory.
370    ///
371    /// # Examples
372    ///
373    /// ```rust
374    /// use grafos_tensor::FabricTensor;
375    ///
376    /// # grafos_std::host::reset_mock();
377    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
378    /// let t = FabricTensor::from_slice(&[2, 2], &[1.0, 2.0, 3.0, 4.0]).unwrap();
379    /// assert_eq!(t.get(&[0, 1]).unwrap(), 2.0);
380    /// assert_eq!(t.get(&[1, 0]).unwrap(), 3.0);
381    /// ```
382    pub fn from_slice(shape: &[usize], data: &[f32]) -> Result<Self> {
383        let s = Shape::new(shape);
384        if data.len() != s.numel() {
385            return Err(FabricError::CapacityExceeded);
386        }
387        let byte_size = std::mem::size_of_val(data);
388        let lease = MemBuilder::new().min_bytes(byte_size as u64).acquire()?;
389        Ok(FabricTensor {
390            storage: TensorStorage::Cpu(lease),
391            shape: s,
392            data: data.to_vec(),
393        })
394    }
395
396    /// Return the current placement device for this tensor.
397    pub fn device(&self) -> Device {
398        #[cfg(feature = "gpu")]
399        {
400            if let Some(gpu) = self.gpu_lease() {
401                return Device::Gpu(gpu.lease_id());
402            }
403        }
404        Device::Cpu
405    }
406
407    /// Returns `true` if the tensor is placed on CPU memory.
408    pub fn is_cpu(&self) -> bool {
409        matches!(self.device(), Device::Cpu)
410    }
411
412    /// Returns `true` if the tensor is placed on GPU memory.
413    pub fn is_gpu(&self) -> bool {
414        matches!(self.device(), Device::Gpu(_))
415    }
416
417    /// Move/copy this tensor to GPU placement.
418    ///
419    /// On builds without the `gpu` feature, returns [`FabricError::Unsupported`].
420    pub fn to_gpu(&self) -> Result<FabricTensor> {
421        #[cfg(feature = "gpu")]
422        {
423            if self.is_gpu() {
424                return FabricTensor::from_slice(self.shape(), self.as_slice()).and_then(
425                    |mut t| {
426                        let gpu = GpuBuilder::new()
427                            .min_vram((self.numel() * core::mem::size_of::<f32>()) as u64)
428                            .acquire()?;
429                        let staging = MemBuilder::new()
430                            .min_bytes((self.numel() * core::mem::size_of::<f32>()) as u64)
431                            .acquire()?;
432                        let args = self.encode_f32_le();
433                        let arg_sizes = [args.len() as u32];
434                        submit_signal_kernel(
435                            &gpu,
436                            "tensor_upload",
437                            b"grafos.tensor.mock",
438                            &args,
439                            &arg_sizes,
440                        )?;
441                        t.storage = TensorStorage::Gpu {
442                            gpu_lease: gpu,
443                            staging_lease: staging,
444                        };
445                        Ok(t)
446                    },
447                );
448            }
449
450            let bytes = (self.numel() * core::mem::size_of::<f32>()) as u64;
451            let gpu_lease = GpuBuilder::new().min_vram(bytes).acquire()?;
452            let staging_lease = MemBuilder::new().min_bytes(bytes).acquire()?;
453            let args = self.encode_f32_le();
454            let arg_sizes = [args.len() as u32];
455            submit_signal_kernel(
456                &gpu_lease,
457                "tensor_upload",
458                b"grafos.tensor.mock",
459                &args,
460                &arg_sizes,
461            )?;
462
463            return Ok(FabricTensor {
464                storage: TensorStorage::Gpu {
465                    gpu_lease,
466                    staging_lease,
467                },
468                shape: self.shape.clone(),
469                data: self.data.clone(),
470            });
471        }
472
473        #[cfg(not(feature = "gpu"))]
474        {
475            Err(FabricError::Unsupported)
476        }
477    }
478
479    /// Move/copy this tensor to CPU placement.
480    pub fn to_cpu(&self) -> Result<FabricTensor> {
481        if self.is_cpu() {
482            return FabricTensor::from_slice(self.shape(), self.as_slice());
483        }
484
485        #[cfg(feature = "gpu")]
486        {
487            if let Some(gpu) = self.gpu_lease() {
488                let args = self.encode_f32_le();
489                let arg_sizes = [args.len() as u32];
490                submit_signal_kernel(
491                    gpu,
492                    "tensor_download",
493                    b"grafos.tensor.mock",
494                    &args,
495                    &arg_sizes,
496                )?;
497            }
498        }
499        FabricTensor::from_slice(self.shape(), self.as_slice())
500    }
501
502    /// Get the element at the given multi-dimensional indices.
503    ///
504    /// The length of `indices` must equal [`ndim()`](Self::ndim), and each
505    /// index must be within its dimension's range.
506    ///
507    /// # Errors
508    ///
509    /// Returns [`FabricError::CapacityExceeded`] if any index is out of
510    /// bounds or if the wrong number of indices is provided.
511    ///
512    /// # Examples
513    ///
514    /// ```rust
515    /// use grafos_tensor::FabricTensor;
516    ///
517    /// # grafos_std::host::reset_mock();
518    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
519    /// let t = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
520    /// assert_eq!(t.get(&[0, 2]).unwrap(), 3.0);
521    /// assert_eq!(t.get(&[1, 1]).unwrap(), 5.0);
522    /// assert!(t.get(&[2, 0]).is_err()); // out of bounds
523    /// ```
524    pub fn get(&self, indices: &[usize]) -> Result<f32> {
525        let offset = self
526            .shape
527            .flat_index(indices)
528            .ok_or(FabricError::CapacityExceeded)?;
529        Ok(self.data[offset])
530    }
531
532    /// Set the element at the given multi-dimensional indices.
533    ///
534    /// # Errors
535    ///
536    /// Returns [`FabricError::CapacityExceeded`] if any index is out of
537    /// bounds or if the wrong number of indices is provided.
538    ///
539    /// # Examples
540    ///
541    /// ```rust
542    /// use grafos_tensor::FabricTensor;
543    ///
544    /// # grafos_std::host::reset_mock();
545    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
546    /// let mut t = FabricTensor::zeros(&[2, 2]).unwrap();
547    /// t.set(&[1, 0], 42.0).unwrap();
548    /// assert_eq!(t.get(&[1, 0]).unwrap(), 42.0);
549    /// ```
550    pub fn set(&mut self, indices: &[usize], value: f32) -> Result<()> {
551        let offset = self
552            .shape
553            .flat_index(indices)
554            .ok_or(FabricError::CapacityExceeded)?;
555        self.data[offset] = value;
556        Ok(())
557    }
558
559    /// Dimension sizes as a slice (e.g. `&[2, 3]` for a 2x3 matrix).
560    pub fn shape(&self) -> &[usize] {
561        self.shape.dims()
562    }
563
564    /// Number of dimensions (0 for scalar, 1 for vector, 2 for matrix, etc.).
565    pub fn ndim(&self) -> usize {
566        self.shape.ndim()
567    }
568
569    /// Total number of elements (product of all dimension sizes).
570    pub fn numel(&self) -> usize {
571        self.shape.numel()
572    }
573
574    /// Row-major strides (e.g. `&[3, 1]` for a 2x3 matrix).
575    pub fn strides(&self) -> &[usize] {
576        self.shape.strides()
577    }
578
579    /// Access raw data as a contiguous `f32` slice in row-major order.
580    pub fn as_slice(&self) -> &[f32] {
581        &self.data
582    }
583
584    #[cfg(feature = "gpu")]
585    fn encode_f32_le(&self) -> Vec<u8> {
586        let mut bytes = Vec::with_capacity(self.data.len() * core::mem::size_of::<f32>());
587        for value in &self.data {
588            bytes.extend_from_slice(&value.to_le_bytes());
589        }
590        bytes
591    }
592
593    #[cfg(feature = "gpu")]
594    fn push_u16_le(buf: &mut Vec<u8>, v: u16) {
595        buf.extend_from_slice(&v.to_le_bytes());
596    }
597
598    #[cfg(feature = "gpu")]
599    fn push_u32_le(buf: &mut Vec<u8>, v: u32) {
600        buf.extend_from_slice(&v.to_le_bytes());
601    }
602
603    #[cfg(feature = "gpu")]
604    fn push_u64_le(buf: &mut Vec<u8>, v: u64) {
605        buf.extend_from_slice(&v.to_le_bytes());
606    }
607
    /// Append this tensor's descriptor to `out` for the GTA0 argument
    /// stream: `ndim` (u16 LE), `numel` (u64 LE), a zero offset (u64 LE),
    /// then each dimension (u32 LE) followed by each stride (u32 LE).
    ///
    /// # Errors
    ///
    /// Returns [`FabricError::CapacityExceeded`] if `ndim`, `numel`, any
    /// dimension, or any stride does not fit its wire-format integer width.
    #[cfg(feature = "gpu")]
    fn encode_tensor_descriptor(&self, out: &mut Vec<u8>) -> Result<()> {
        let ndim: u16 = self
            .ndim()
            .try_into()
            .map_err(|_| FabricError::CapacityExceeded)?;
        Self::push_u16_le(out, ndim);

        let numel: u64 = self
            .numel()
            .try_into()
            .map_err(|_| FabricError::CapacityExceeded)?;
        Self::push_u64_le(out, numel);
        Self::push_u64_le(out, 0); // offset — always written as 0 here

        for &dim in self.shape() {
            let dim_u32: u32 = dim.try_into().map_err(|_| FabricError::CapacityExceeded)?;
            Self::push_u32_le(out, dim_u32);
        }
        for &stride in self.strides() {
            let stride_u32: u32 = stride
                .try_into()
                .map_err(|_| FabricError::CapacityExceeded)?;
            Self::push_u32_le(out, stride_u32);
        }
        Ok(())
    }
635
636    #[cfg(feature = "gpu")]
637    fn kernel_binary_for(op_name: &str) -> Cow<'static, [u8]> {
638        #[cfg(all(feature = "gpu", feature = "std"))]
639        {
640            if let Some(file) = env::var_os("GRAFOS_TENSOR_HSACO") {
641                let path = PathBuf::from(file);
642                if let Ok(binary) = fs::read(path) {
643                    if !binary.is_empty() {
644                        return Cow::Owned(binary);
645                    }
646                }
647            }
648
649            if let Some(dir) = env::var_os("GRAFOS_TENSOR_KERNEL_DIR") {
650                let base = PathBuf::from(dir);
651                let candidates = [
652                    base.join(format!("{op_name}.hsaco")),
653                    base.join("tensor_ops_gfx1032.hsaco"),
654                    base.join("tensor_ops.hsaco"),
655                ];
656                for candidate in candidates {
657                    if let Ok(binary) = fs::read(&candidate) {
658                        if !binary.is_empty() {
659                            return Cow::Owned(binary);
660                        }
661                    }
662                }
663            }
664        }
665        Cow::Borrowed(b"grafos.tensor.mock")
666    }
667
    /// Pack the GTA0 argument stream for a single-tensor op: magic
    /// `"GTA0"`, version byte `1`, tensor-count byte `1`, the op name
    /// (u16 LE length + bytes), this tensor's descriptor, then its payload
    /// (u64 LE byte length + little-endian f32 data).
    #[cfg(feature = "gpu")]
    fn pack_gpu_unary_args(&self, op_name: &str) -> Result<Vec<u8>> {
        let mut args = Vec::new();
        args.extend_from_slice(b"GTA0");
        args.push(1); // version
        args.push(1); // tensor count
        Self::push_u16_le(&mut args, op_name.len() as u16);
        args.extend_from_slice(op_name.as_bytes());
        self.encode_tensor_descriptor(&mut args)?;
        let data = self.encode_f32_le();
        Self::push_u64_le(&mut args, data.len() as u64);
        args.extend_from_slice(&data);
        Ok(args)
    }
682
    /// Pack the GTA0 argument stream for a two-tensor op: magic `"GTA0"`,
    /// version byte `1`, tensor-count byte `2`, the op name (u16 LE length
    /// + bytes), then for `self` and `other` in order: descriptor followed
    /// by payload (u64 LE byte length + little-endian f32 data).
    #[cfg(feature = "gpu")]
    fn pack_gpu_binary_args(&self, other: &FabricTensor, op_name: &str) -> Result<Vec<u8>> {
        let mut args = Vec::new();
        args.extend_from_slice(b"GTA0");
        args.push(1); // version
        args.push(2); // tensor count
        Self::push_u16_le(&mut args, op_name.len() as u16);
        args.extend_from_slice(op_name.as_bytes());

        self.encode_tensor_descriptor(&mut args)?;
        let lhs = self.encode_f32_le();
        Self::push_u64_le(&mut args, lhs.len() as u64);
        args.extend_from_slice(&lhs);

        other.encode_tensor_descriptor(&mut args)?;
        let rhs = other.encode_f32_le();
        Self::push_u64_le(&mut args, rhs.len() as u64);
        args.extend_from_slice(&rhs);
        Ok(args)
    }
703
704    #[cfg(feature = "gpu")]
705    fn dispatch_gpu_unary(&self, op_name: &str) -> Result<()> {
706        let Some(gpu) = self.gpu_lease() else {
707            return Ok(());
708        };
709        let args = self.pack_gpu_unary_args(op_name)?;
710        let binary = Self::kernel_binary_for(op_name);
711        let arg_sizes = [args.len() as u32];
712        submit_signal_kernel(gpu, op_name, binary.as_ref(), &args, &arg_sizes)?;
713        Ok(())
714    }
715
716    #[cfg(feature = "gpu")]
717    fn dispatch_gpu_binary(&self, other: &FabricTensor, op_name: &str) -> Result<()> {
718        let Some(gpu) = self.gpu_lease() else {
719            return Ok(());
720        };
721        let args = self.pack_gpu_binary_args(other, op_name)?;
722        let binary = Self::kernel_binary_for(op_name);
723        let arg_sizes = [args.len() as u32];
724        submit_signal_kernel(gpu, op_name, binary.as_ref(), &args, &arg_sizes)?;
725        Ok(())
726    }
727
728    fn cpu_matmul(&self, other: &FabricTensor) -> Result<FabricTensor> {
729        if self.ndim() != 2 || other.ndim() != 2 {
730            return Err(FabricError::CapacityExceeded);
731        }
732        let m = self.shape.dims[0];
733        let k = self.shape.dims[1];
734        let k2 = other.shape.dims[0];
735        let n = other.shape.dims[1];
736        if k != k2 {
737            return Err(FabricError::CapacityExceeded);
738        }
739        let mut result_data = vec![0.0f32; m * n];
740        for i in 0..m {
741            for j in 0..n {
742                let mut sum = 0.0f32;
743                for p in 0..k {
744                    sum += self.data[i * k + p] * other.data[p * n + j];
745                }
746                result_data[i * n + j] = sum;
747            }
748        }
749        FabricTensor::from_slice(&[m, n], &result_data)
750    }
751
752    fn cpu_add(&self, other: &FabricTensor) -> Result<FabricTensor> {
753        if self.shape.dims != other.shape.dims {
754            return Err(FabricError::CapacityExceeded);
755        }
756        let data: Vec<f32> = self
757            .data
758            .iter()
759            .zip(other.data.iter())
760            .map(|(a, b)| a + b)
761            .collect();
762        FabricTensor::from_slice(self.shape.dims(), &data)
763    }
764
765    fn cpu_mul(&self, other: &FabricTensor) -> Result<FabricTensor> {
766        if self.shape.dims != other.shape.dims {
767            return Err(FabricError::CapacityExceeded);
768        }
769        let data: Vec<f32> = self
770            .data
771            .iter()
772            .zip(other.data.iter())
773            .map(|(a, b)| a * b)
774            .collect();
775        FabricTensor::from_slice(self.shape.dims(), &data)
776    }
777
    /// Matrix multiplication (2-D only).
    ///
    /// Computes `C = self @ other` using a triple-loop algorithm. Both tensors
    /// must be 2-D. If `self` has shape `[M, K]` and `other` has shape
    /// `[K, N]`, the result has shape `[M, N]`.
    ///
    /// # Errors
    ///
    /// Returns [`FabricError::CapacityExceeded`] if either tensor is not 2-D,
    /// if the inner dimensions don't match, or if the fabric cannot provide
    /// memory for the result.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use grafos_tensor::FabricTensor;
    ///
    /// # grafos_std::host::reset_mock();
    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
    /// let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
    /// let b = FabricTensor::from_slice(&[3, 2], &[7.0, 8.0, 9.0, 10.0, 11.0, 12.0]).unwrap();
    /// let c = a.matmul(&b).unwrap();
    /// assert_eq!(c.shape(), &[2, 2]);
    /// // C[0,0] = 1*7 + 2*9 + 3*11 = 58
    /// assert_eq!(c.get(&[0, 0]).unwrap(), 58.0);
    /// ```
    pub fn matmul(&self, other: &FabricTensor) -> Result<FabricTensor> {
        #[cfg(feature = "gpu")]
        if self.is_gpu() && other.is_gpu() {
            // Both GPU-placed: signal a kernel dispatch (control-path only
            // under the mock kernels), compute numerically on the CPU path,
            // then move the result back to GPU placement.
            self.dispatch_gpu_binary(other, "tensor_matmul")?;
            let cpu_result = self.cpu_matmul(other)?;
            return cpu_result.to_gpu();
        }
        // CPU or mixed placement: plain CPU compute.
        self.cpu_matmul(other)
    }
813
    /// Elementwise addition.
    ///
    /// Both tensors must have the same shape. Returns a new tensor where
    /// each element is the sum of the corresponding elements.
    ///
    /// Also available via the `+` operator: `(&a + &b).unwrap()`.
    ///
    /// # Errors
    ///
    /// Returns [`FabricError::CapacityExceeded`] if shapes don't match.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use grafos_tensor::FabricTensor;
    ///
    /// # grafos_std::host::reset_mock();
    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
    /// let a = FabricTensor::from_slice(&[2, 2], &[1.0, 2.0, 3.0, 4.0]).unwrap();
    /// let b = FabricTensor::from_slice(&[2, 2], &[10.0, 20.0, 30.0, 40.0]).unwrap();
    /// let c = a.add(&b).unwrap();
    /// assert_eq!(c.as_slice(), &[11.0, 22.0, 33.0, 44.0]);
    /// ```
    pub fn add(&self, other: &FabricTensor) -> Result<FabricTensor> {
        #[cfg(feature = "gpu")]
        if self.is_gpu() && other.is_gpu() {
            // Both GPU-placed: signal dispatch, compute on CPU, re-place.
            self.dispatch_gpu_binary(other, "tensor_add")?;
            let cpu_result = self.cpu_add(other)?;
            return cpu_result.to_gpu();
        }
        // CPU or mixed placement: plain CPU compute.
        self.cpu_add(other)
    }
846
    /// Elementwise multiplication (Hadamard product).
    ///
    /// Both tensors must have the same shape. Returns a new tensor where
    /// each element is the product of the corresponding elements.
    ///
    /// Also available via the `*` operator: `(&a * &b).unwrap()`.
    ///
    /// # Errors
    ///
    /// Returns [`FabricError::CapacityExceeded`] if shapes don't match.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use grafos_tensor::FabricTensor;
    ///
    /// # grafos_std::host::reset_mock();
    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
    /// let a = FabricTensor::from_slice(&[3], &[2.0, 3.0, 4.0]).unwrap();
    /// let b = FabricTensor::from_slice(&[3], &[5.0, 6.0, 7.0]).unwrap();
    /// let c = a.mul(&b).unwrap();
    /// assert_eq!(c.as_slice(), &[10.0, 18.0, 28.0]);
    /// ```
    pub fn mul(&self, other: &FabricTensor) -> Result<FabricTensor> {
        #[cfg(feature = "gpu")]
        if self.is_gpu() && other.is_gpu() {
            // Both GPU-placed: signal dispatch, compute on CPU, re-place.
            self.dispatch_gpu_binary(other, "tensor_mul")?;
            let cpu_result = self.cpu_mul(other)?;
            return cpu_result.to_gpu();
        }
        // CPU or mixed placement: plain CPU compute.
        self.cpu_mul(other)
    }
879
880    /// Scalar multiplication.
881    ///
882    /// Multiplies every element by `scalar`. Works on tensors of any shape.
883    ///
884    /// Also available via the `*` operator with `f32`: `(&a * 2.0f32).unwrap()`.
885    ///
886    /// # Examples
887    ///
888    /// ```rust
889    /// use grafos_tensor::FabricTensor;
890    ///
891    /// # grafos_std::host::reset_mock();
892    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
893    /// let a = FabricTensor::from_slice(&[2, 2], &[1.0, 2.0, 3.0, 4.0]).unwrap();
894    /// let b = a.scale(3.0).unwrap();
895    /// assert_eq!(b.as_slice(), &[3.0, 6.0, 9.0, 12.0]);
896    /// ```
897    pub fn scale(&self, scalar: f32) -> Result<FabricTensor> {
898        #[cfg(feature = "gpu")]
899        if self.is_gpu() {
900            self.dispatch_gpu_unary("tensor_scale")?;
901            let data: Vec<f32> = self.data.iter().map(|&x| x * scalar).collect();
902            let cpu_result = FabricTensor::from_slice(self.shape.dims(), &data)?;
903            return cpu_result.to_gpu();
904        }
905        let data: Vec<f32> = self.data.iter().map(|&x| x * scalar).collect();
906        FabricTensor::from_slice(self.shape.dims(), &data)
907    }
908
909    /// ReLU activation: `max(0, x)` elementwise.
910    ///
911    /// Returns a new tensor where negative values are clamped to zero.
912    /// Commonly used as an activation function in neural network layers.
913    ///
914    /// # Examples
915    ///
916    /// ```rust
917    /// use grafos_tensor::FabricTensor;
918    ///
919    /// # grafos_std::host::reset_mock();
920    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
921    /// let a = FabricTensor::from_slice(&[4], &[-2.0, -0.5, 0.0, 3.0]).unwrap();
922    /// let b = a.relu().unwrap();
923    /// assert_eq!(b.as_slice(), &[0.0, 0.0, 0.0, 3.0]);
924    /// ```
925    pub fn relu(&self) -> Result<FabricTensor> {
926        #[cfg(feature = "gpu")]
927        if self.is_gpu() {
928            self.dispatch_gpu_unary("tensor_relu")?;
929            let data: Vec<f32> = self
930                .data
931                .iter()
932                .map(|&x| if x > 0.0 { x } else { 0.0 })
933                .collect();
934            let cpu_result = FabricTensor::from_slice(self.shape.dims(), &data)?;
935            return cpu_result.to_gpu();
936        }
937        let data: Vec<f32> = self
938            .data
939            .iter()
940            .map(|&x| if x > 0.0 { x } else { 0.0 })
941            .collect();
942        FabricTensor::from_slice(self.shape.dims(), &data)
943    }
944
945    /// Softmax along the specified axis.
946    ///
947    /// For each slice along `axis`, computes `exp(x - max) / sum(exp(x - max))`.
948    /// The max-subtraction provides numerical stability against overflow.
949    ///
950    /// For a `[M, N]` matrix: `softmax(1)` normalizes each row (sums to 1),
951    /// `softmax(0)` normalizes each column.
952    ///
953    /// # Errors
954    ///
955    /// Returns [`FabricError::CapacityExceeded`] if `axis >= ndim()`.
956    ///
957    /// # Examples
958    ///
959    /// ```rust
960    /// use grafos_tensor::FabricTensor;
961    ///
962    /// # grafos_std::host::reset_mock();
963    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
964    /// let a = FabricTensor::from_slice(&[1, 4], &[1.0, 2.0, 3.0, 4.0]).unwrap();
965    /// let probs = a.softmax(1).unwrap();
966    /// let sum: f32 = probs.as_slice().iter().sum();
967    /// assert!((sum - 1.0).abs() < 1e-6);
968    /// ```
969    pub fn softmax(&self, axis: usize) -> Result<FabricTensor> {
970        #[cfg(feature = "gpu")]
971        if self.is_gpu() {
972            self.dispatch_gpu_unary("tensor_softmax")?;
973            let cpu_result = self.softmax_cpu(axis)?;
974            return cpu_result.to_gpu();
975        }
976        self.softmax_cpu(axis)
977    }
978
979    /// Transpose: swap the last two dimensions.
980    ///
981    /// For a 2-D `[M, N]` matrix, produces `[N, M]`. For higher-rank tensors
982    /// (e.g. `[B, M, N]`), produces `[B, N, M]` by batching over leading
983    /// dimensions.
984    ///
985    /// # Errors
986    ///
987    /// Returns [`FabricError::CapacityExceeded`] if the tensor has fewer than
988    /// 2 dimensions.
989    ///
990    /// # Examples
991    ///
992    /// ```rust
993    /// use grafos_tensor::FabricTensor;
994    ///
995    /// # grafos_std::host::reset_mock();
996    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
997    /// let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
998    /// let b = a.transpose().unwrap();
999    /// assert_eq!(b.shape(), &[3, 2]);
1000    /// assert_eq!(b.get(&[0, 1]).unwrap(), 4.0); // was a[1, 0]
1001    /// ```
1002    pub fn transpose(&self) -> Result<FabricTensor> {
1003        #[cfg(feature = "gpu")]
1004        if self.is_gpu() {
1005            self.dispatch_gpu_unary("tensor_transpose")?;
1006            let cpu_result = self.transpose_cpu()?;
1007            return cpu_result.to_gpu();
1008        }
1009        self.transpose_cpu()
1010    }
1011
1012    /// Reshape to a new shape.
1013    ///
1014    /// The total number of elements must remain the same. Data is copied
1015    /// to a new tensor backed by a new lease; the underlying flat storage
1016    /// order does not change.
1017    ///
1018    /// # Errors
1019    ///
1020    /// Returns [`FabricError::CapacityExceeded`] if the new shape's element
1021    /// count differs from the current tensor's.
1022    ///
1023    /// # Examples
1024    ///
1025    /// ```rust
1026    /// use grafos_tensor::FabricTensor;
1027    ///
1028    /// # grafos_std::host::reset_mock();
1029    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
1030    /// let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1031    /// let b = a.reshape(&[3, 2]).unwrap();
1032    /// assert_eq!(b.shape(), &[3, 2]);
1033    /// assert_eq!(b.as_slice(), a.as_slice()); // same data order
1034    /// ```
1035    pub fn reshape(&self, new_shape: &[usize]) -> Result<FabricTensor> {
1036        #[cfg(feature = "gpu")]
1037        if self.is_gpu() {
1038            self.dispatch_gpu_unary("tensor_reshape")?;
1039            let cpu_result = self.reshape_cpu(new_shape)?;
1040            return cpu_result.to_gpu();
1041        }
1042        self.reshape_cpu(new_shape)
1043    }
1044
    /// CPU softmax along `axis`, using the numerically stable
    /// `exp(x - max) / sum(exp(x - max))` formulation.
    ///
    /// Walks every 1-D lane along `axis` by splitting the flat index into an
    /// outer part (dims before `axis`) and an inner part (dims after it).
    fn softmax_cpu(&self, axis: usize) -> Result<FabricTensor> {
        if axis >= self.ndim() {
            return Err(FabricError::CapacityExceeded);
        }
        let dims = &self.shape.dims;
        let strides = &self.shape.strides;
        // Seed the output with a copy of the input; the lane passes below
        // rewrite the entries they visit.
        let mut result = self.data.clone();

        let axis_len = dims[axis];
        // NOTE(review): lane stepping mixes strides[axis] with a base built
        // from inner_size — these agree only for contiguous row-major
        // tensors, where strides[axis] == inner_size. TODO confirm all
        // tensors reaching here are contiguous.
        let axis_stride = strides[axis];
        let outer_size: usize = dims[..axis].iter().product();
        let inner_size: usize = dims[axis + 1..].iter().product();

        for outer in 0..outer_size {
            for inner in 0..inner_size {
                // Flat index of element 0 of this lane.
                let base = outer * (axis_len * inner_size) + inner;
                // Pass 1: lane maximum, subtracted later for stability.
                let mut max_val = f32::NEG_INFINITY;
                for a in 0..axis_len {
                    let idx = base + a * axis_stride;
                    if self.data[idx] > max_val {
                        max_val = self.data[idx];
                    }
                }

                // Pass 2: shifted exponentials and their running sum.
                let mut sum = 0.0f32;
                for a in 0..axis_len {
                    let idx = base + a * axis_stride;
                    let exp_val = (self.data[idx] - max_val).exp();
                    result[idx] = exp_val;
                    sum += exp_val;
                }

                // Pass 3: normalize the lane so it sums to 1.
                for a in 0..axis_len {
                    let idx = base + a * axis_stride;
                    result[idx] /= sum;
                }
            }
        }

        FabricTensor::from_slice(dims, &result)
    }
1086
1087    fn transpose_cpu(&self) -> Result<FabricTensor> {
1088        if self.ndim() < 2 {
1089            return Err(FabricError::CapacityExceeded);
1090        }
1091        let dims = &self.shape.dims;
1092        let ndim = dims.len();
1093
1094        let mut new_dims = dims.to_vec();
1095        new_dims.swap(ndim - 2, ndim - 1);
1096
1097        let rows = dims[ndim - 2];
1098        let cols = dims[ndim - 1];
1099        let batch_size: usize = dims[..ndim - 2].iter().product();
1100        let matrix_size = rows * cols;
1101
1102        let mut result = vec![0.0f32; self.data.len()];
1103        for b in 0..batch_size {
1104            let src_base = b * matrix_size;
1105            let dst_base = b * matrix_size;
1106            for r in 0..rows {
1107                for c in 0..cols {
1108                    result[dst_base + c * rows + r] = self.data[src_base + r * cols + c];
1109                }
1110            }
1111        }
1112
1113        FabricTensor::from_slice(&new_dims, &result)
1114    }
1115
1116    fn reshape_cpu(&self, new_shape: &[usize]) -> Result<FabricTensor> {
1117        let new_numel: usize = new_shape.iter().product();
1118        if new_numel != self.numel() {
1119            return Err(FabricError::CapacityExceeded);
1120        }
1121        FabricTensor::from_slice(new_shape, &self.data)
1122    }
1123
1124    /// Elementwise subtraction: `self - other`.
1125    ///
1126    /// Both tensors must have the same shape.
1127    ///
1128    /// # Examples
1129    ///
1130    /// ```rust
1131    /// use grafos_tensor::FabricTensor;
1132    ///
1133    /// # grafos_std::host::reset_mock();
1134    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
1135    /// let a = FabricTensor::from_slice(&[3], &[5.0, 3.0, 1.0]).unwrap();
1136    /// let b = FabricTensor::from_slice(&[3], &[1.0, 2.0, 3.0]).unwrap();
1137    /// let c = a.subtract(&b).unwrap();
1138    /// assert_eq!(c.as_slice(), &[4.0, 1.0, -2.0]);
1139    /// ```
1140    pub fn subtract(&self, other: &FabricTensor) -> Result<FabricTensor> {
1141        if self.shape() != other.shape() {
1142            return Err(FabricError::CapacityExceeded);
1143        }
1144        #[cfg(feature = "gpu")]
1145        if self.is_gpu() {
1146            let neg = other.scale(-1.0)?;
1147            return self.add(&neg);
1148        }
1149        let data: Vec<f32> = self
1150            .data
1151            .iter()
1152            .zip(other.data.iter())
1153            .map(|(&a, &b)| a - b)
1154            .collect();
1155        FabricTensor::from_slice(self.shape(), &data)
1156    }
1157
1158    /// Sum along the specified axis, reducing that dimension.
1159    ///
1160    /// For a `[M, N]` matrix: `sum_axis(0)` produces `[N]` (column sums),
1161    /// `sum_axis(1)` produces `[M]` (row sums).
1162    ///
1163    /// # Errors
1164    ///
1165    /// Returns [`FabricError::CapacityExceeded`] if `axis >= ndim()`.
1166    ///
1167    /// # Examples
1168    ///
1169    /// ```rust
1170    /// use grafos_tensor::FabricTensor;
1171    ///
1172    /// # grafos_std::host::reset_mock();
1173    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
1174    /// let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1175    /// let row_sums = a.sum_axis(1).unwrap();
1176    /// assert_eq!(row_sums.shape(), &[2]);
1177    /// assert_eq!(row_sums.as_slice(), &[6.0, 15.0]);
1178    /// let col_sums = a.sum_axis(0).unwrap();
1179    /// assert_eq!(col_sums.shape(), &[3]);
1180    /// assert_eq!(col_sums.as_slice(), &[5.0, 7.0, 9.0]);
1181    /// ```
1182    pub fn sum_axis(&self, axis: usize) -> Result<FabricTensor> {
1183        if axis >= self.ndim() {
1184            return Err(FabricError::CapacityExceeded);
1185        }
1186        let dims = self.shape();
1187        let axis_len = dims[axis];
1188        let mut new_dims: Vec<usize> = dims
1189            .iter()
1190            .enumerate()
1191            .filter(|&(i, _)| i != axis)
1192            .map(|(_, &d)| d)
1193            .collect();
1194        if new_dims.is_empty() {
1195            new_dims.push(1);
1196        }
1197        let new_numel: usize = new_dims.iter().product();
1198        let mut result = vec![0.0f32; new_numel];
1199
1200        let outer_size: usize = dims[..axis].iter().product();
1201        let inner_size: usize = dims[axis + 1..].iter().product();
1202
1203        for outer in 0..outer_size {
1204            for inner in 0..inner_size {
1205                let mut sum = 0.0f32;
1206                for a in 0..axis_len {
1207                    let src_idx = outer * (axis_len * inner_size) + a * inner_size + inner;
1208                    sum += self.data[src_idx];
1209                }
1210                let dst_idx = outer * inner_size + inner;
1211                result[dst_idx] = sum;
1212            }
1213        }
1214
1215        FabricTensor::from_slice(&new_dims, &result)
1216    }
1217
1218    /// Elementwise sigmoid: `1 / (1 + exp(-x))`.
1219    ///
1220    /// # Examples
1221    ///
1222    /// ```rust
1223    /// use grafos_tensor::FabricTensor;
1224    ///
1225    /// # grafos_std::host::reset_mock();
1226    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
1227    /// let a = FabricTensor::from_slice(&[3], &[0.0, 100.0, -100.0]).unwrap();
1228    /// let b = a.sigmoid().unwrap();
1229    /// assert!((b.as_slice()[0] - 0.5).abs() < 1e-6);
1230    /// assert!((b.as_slice()[1] - 1.0).abs() < 1e-4);
1231    /// assert!(b.as_slice()[2] < 1e-4);
1232    /// ```
1233    pub fn sigmoid(&self) -> Result<FabricTensor> {
1234        #[cfg(feature = "gpu")]
1235        if self.is_gpu() {
1236            self.dispatch_gpu_unary("tensor_sigmoid")?;
1237            let cpu_result = self.sigmoid_cpu()?;
1238            return cpu_result.to_gpu();
1239        }
1240        self.sigmoid_cpu()
1241    }
1242
1243    fn sigmoid_cpu(&self) -> Result<FabricTensor> {
1244        let data: Vec<f32> = self
1245            .data
1246            .iter()
1247            .map(|&x| 1.0 / (1.0 + (-x).exp()))
1248            .collect();
1249        FabricTensor::from_slice(self.shape(), &data)
1250    }
1251
1252    /// Elementwise natural logarithm.
1253    ///
1254    /// # Examples
1255    ///
1256    /// ```rust
1257    /// use grafos_tensor::FabricTensor;
1258    ///
1259    /// # grafos_std::host::reset_mock();
1260    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
1261    /// let a = FabricTensor::from_slice(&[3], &[1.0, core::f32::consts::E, 10.0]).unwrap();
1262    /// let b = a.ln().unwrap();
1263    /// assert!((b.as_slice()[0] - 0.0).abs() < 1e-6);
1264    /// assert!((b.as_slice()[1] - 1.0).abs() < 1e-6);
1265    /// ```
1266    pub fn ln(&self) -> Result<FabricTensor> {
1267        #[cfg(feature = "gpu")]
1268        if self.is_gpu() {
1269            self.dispatch_gpu_unary("tensor_ln")?;
1270            let cpu_result = self.ln_cpu()?;
1271            return cpu_result.to_gpu();
1272        }
1273        self.ln_cpu()
1274    }
1275
1276    fn ln_cpu(&self) -> Result<FabricTensor> {
1277        let data: Vec<f32> = self.data.iter().map(|&x| x.ln()).collect();
1278        FabricTensor::from_slice(self.shape(), &data)
1279    }
1280
1281    /// Elementwise clamp to `[min, max]`.
1282    ///
1283    /// # Examples
1284    ///
1285    /// ```rust
1286    /// use grafos_tensor::FabricTensor;
1287    ///
1288    /// # grafos_std::host::reset_mock();
1289    /// # grafos_std::host::mock_set_fbmu_arena_size(65536);
1290    /// let a = FabricTensor::from_slice(&[4], &[-1.0, 0.5, 1.5, 3.0]).unwrap();
1291    /// let b = a.clip(0.0, 1.0).unwrap();
1292    /// assert_eq!(b.as_slice(), &[0.0, 0.5, 1.0, 1.0]);
1293    /// ```
1294    pub fn clip(&self, min: f32, max: f32) -> Result<FabricTensor> {
1295        let data: Vec<f32> = self.data.iter().map(|&x| x.clamp(min, max)).collect();
1296        FabricTensor::from_slice(self.shape(), &data)
1297    }
1298
1299    /// Forward FFT of a 1-D real-valued tensor.
1300    ///
1301    /// Input must be a 1-D tensor with a power-of-2 number of elements.
1302    /// Returns a 1-D tensor of length `2*N` containing interleaved `[re, im]`
1303    /// pairs for each frequency bin.
1304    ///
1305    /// On GPU-placed tensors (with `gpu` feature), dispatches through the GPU
1306    /// submit path before computing the CPU result.
1307    pub fn fft(&self) -> Result<FabricTensor> {
1308        if self.ndim() != 1 {
1309            return Err(FabricError::CapacityExceeded);
1310        }
1311        let n = self.shape.dims[0];
1312        if n == 0 || !n.is_power_of_two() {
1313            return Err(FabricError::CapacityExceeded);
1314        }
1315
1316        #[cfg(feature = "gpu")]
1317        if self.is_gpu() {
1318            self.dispatch_gpu_unary("tensor_fft")?;
1319            let cpu_result = self.fft_cpu()?;
1320            return cpu_result.to_gpu();
1321        }
1322
1323        self.fft_cpu()
1324    }
1325
1326    /// Inverse FFT returning a 1-D real-valued tensor.
1327    ///
1328    /// Input must be a 1-D tensor of length `2*N` containing interleaved
1329    /// `[re, im]` pairs. `N` must be a power of 2. Returns a 1-D tensor
1330    /// of `N` real samples.
1331    ///
1332    /// On GPU-placed tensors (with `gpu` feature), dispatches through the GPU
1333    /// submit path before computing the CPU result.
1334    pub fn ifft(&self) -> Result<FabricTensor> {
1335        if self.ndim() != 1 {
1336            return Err(FabricError::CapacityExceeded);
1337        }
1338        let len = self.shape.dims[0];
1339        if len == 0 || !len.is_multiple_of(2) {
1340            return Err(FabricError::CapacityExceeded);
1341        }
1342        let n = len / 2;
1343        if !n.is_power_of_two() {
1344            return Err(FabricError::CapacityExceeded);
1345        }
1346
1347        #[cfg(feature = "gpu")]
1348        if self.is_gpu() {
1349            self.dispatch_gpu_unary("tensor_ifft")?;
1350            let cpu_result = self.ifft_cpu()?;
1351            return cpu_result.to_gpu();
1352        }
1353
1354        self.ifft_cpu()
1355    }
1356
1357    fn fft_cpu(&self) -> Result<FabricTensor> {
1358        let n = self.shape.dims[0];
1359        // Build complex input from real data
1360        let mut complex = vec![(0.0f32, 0.0f32); n];
1361        for (i, &val) in self.data.iter().enumerate() {
1362            complex[i] = (val, 0.0);
1363        }
1364
1365        // Bit-reverse permutation
1366        let bits = n.trailing_zeros();
1367        for i in 0..n {
1368            let j = i.reverse_bits() >> (usize::BITS - bits);
1369            if i < j {
1370                complex.swap(i, j);
1371            }
1372        }
1373
1374        // Cooley-Tukey butterfly (forward)
1375        let mut len = 2;
1376        while len <= n {
1377            let half = len / 2;
1378            let angle = -2.0 * core::f32::consts::PI / len as f32;
1379            for start in (0..n).step_by(len) {
1380                let mut w_re = 1.0f32;
1381                let mut w_im = 0.0f32;
1382                let step_re = angle.cos();
1383                let step_im = angle.sin();
1384                for k in 0..half {
1385                    let (e_re, e_im) = complex[start + k];
1386                    let (o_re, o_im) = complex[start + k + half];
1387                    let tw_re = o_re * w_re - o_im * w_im;
1388                    let tw_im = o_re * w_im + o_im * w_re;
1389                    complex[start + k] = (e_re + tw_re, e_im + tw_im);
1390                    complex[start + k + half] = (e_re - tw_re, e_im - tw_im);
1391                    let new_w_re = w_re * step_re - w_im * step_im;
1392                    let new_w_im = w_re * step_im + w_im * step_re;
1393                    w_re = new_w_re;
1394                    w_im = new_w_im;
1395                }
1396            }
1397            len *= 2;
1398        }
1399
1400        // Pack as interleaved [re, im, re, im, ...]
1401        let mut out = Vec::with_capacity(n * 2);
1402        for &(re, im) in &complex {
1403            out.push(re);
1404            out.push(im);
1405        }
1406        FabricTensor::from_slice(&[n * 2], &out)
1407    }
1408
1409    fn ifft_cpu(&self) -> Result<FabricTensor> {
1410        let len = self.shape.dims[0];
1411        let n = len / 2;
1412
1413        // Unpack interleaved [re, im, ...] into complex pairs
1414        let mut complex: Vec<(f32, f32)> = (0..n)
1415            .map(|i| (self.data[i * 2], self.data[i * 2 + 1]))
1416            .collect();
1417
1418        // Bit-reverse permutation
1419        let bits = n.trailing_zeros();
1420        for i in 0..n {
1421            let j = i.reverse_bits() >> (usize::BITS - bits);
1422            if i < j {
1423                complex.swap(i, j);
1424            }
1425        }
1426
1427        // Cooley-Tukey butterfly (inverse: positive angle)
1428        let mut blen = 2;
1429        while blen <= n {
1430            let half = blen / 2;
1431            let angle = 2.0 * core::f32::consts::PI / blen as f32;
1432            for start in (0..n).step_by(blen) {
1433                let mut w_re = 1.0f32;
1434                let mut w_im = 0.0f32;
1435                let step_re = angle.cos();
1436                let step_im = angle.sin();
1437                for k in 0..half {
1438                    let (e_re, e_im) = complex[start + k];
1439                    let (o_re, o_im) = complex[start + k + half];
1440                    let tw_re = o_re * w_re - o_im * w_im;
1441                    let tw_im = o_re * w_im + o_im * w_re;
1442                    complex[start + k] = (e_re + tw_re, e_im + tw_im);
1443                    complex[start + k + half] = (e_re - tw_re, e_im - tw_im);
1444                    let new_w_re = w_re * step_re - w_im * step_im;
1445                    let new_w_im = w_re * step_im + w_im * step_re;
1446                    w_re = new_w_re;
1447                    w_im = new_w_im;
1448                }
1449            }
1450            blen *= 2;
1451        }
1452
1453        // Scale by 1/N and return real parts
1454        let scale = 1.0 / n as f32;
1455        let out: Vec<f32> = complex.iter().map(|&(re, _)| re * scale).collect();
1456        FabricTensor::from_slice(&[n], &out)
1457    }
1458}
1459
1460/// Elementwise addition via the `+` operator.
1461///
1462/// Usage: `(&a + &b).unwrap()`
1463///
1464/// Delegates to [`FabricTensor::add`]. Both tensors must have the same shape.
1465impl<'a> Add for &'a FabricTensor {
1466    type Output = Result<FabricTensor>;
1467
1468    fn add(self, rhs: &'a FabricTensor) -> Self::Output {
1469        self.add(rhs)
1470    }
1471}
1472
1473/// Elementwise multiplication via the `*` operator.
1474///
1475/// Usage: `(&a * &b).unwrap()`
1476///
1477/// Delegates to [`FabricTensor::mul`]. Both tensors must have the same shape.
1478impl<'a> Mul for &'a FabricTensor {
1479    type Output = Result<FabricTensor>;
1480
1481    fn mul(self, rhs: &'a FabricTensor) -> Self::Output {
1482        FabricTensor::mul(self, rhs)
1483    }
1484}
1485
1486/// Scalar multiplication via the `*` operator.
1487///
1488/// Usage: `(&a * 2.0f32).unwrap()`
1489///
1490/// Delegates to [`FabricTensor::scale`].
1491impl Mul<f32> for &FabricTensor {
1492    type Output = Result<FabricTensor>;
1493
1494    fn mul(self, rhs: f32) -> Self::Output {
1495        self.scale(rhs)
1496    }
1497}
1498
1499/// Elementwise subtraction via the `-` operator.
1500///
1501/// Usage: `(&a - &b).unwrap()`
1502///
1503/// Delegates to [`FabricTensor::subtract`]. Both tensors must have the same shape.
1504impl<'a> Sub for &'a FabricTensor {
1505    type Output = Result<FabricTensor>;
1506
1507    fn sub(self, rhs: &'a FabricTensor) -> Self::Output {
1508        self.subtract(rhs)
1509    }
1510}
1511
1512#[cfg(test)]
1513mod tests {
1514    use super::*;
1515    use grafos_std::host;
1516
1517    fn setup() {
1518        host::reset_mock();
1519        host::mock_set_fbmu_arena_size(1 << 20); // 1 MiB
1520    }
1521
1522    // ---- Creation and element access ----
1523
1524    #[test]
1525    fn zeros_creates_correct_shape() {
1526        setup();
1527        let t = FabricTensor::zeros(&[3, 4]).unwrap();
1528        assert_eq!(t.shape(), &[3, 4]);
1529        assert_eq!(t.ndim(), 2);
1530        assert_eq!(t.numel(), 12);
1531        assert_eq!(t.strides(), &[4, 1]);
1532        for i in 0..3 {
1533            for j in 0..4 {
1534                assert_eq!(t.get(&[i, j]).unwrap(), 0.0);
1535            }
1536        }
1537    }
1538
1539    #[test]
1540    fn from_slice_roundtrip() {
1541        setup();
1542        let data: Vec<f32> = (1..=6).map(|x| x as f32).collect();
1543        let t = FabricTensor::from_slice(&[2, 3], &data).unwrap();
1544        assert_eq!(t.shape(), &[2, 3]);
1545        assert_eq!(t.get(&[0, 0]).unwrap(), 1.0);
1546        assert_eq!(t.get(&[0, 2]).unwrap(), 3.0);
1547        assert_eq!(t.get(&[1, 0]).unwrap(), 4.0);
1548        assert_eq!(t.get(&[1, 2]).unwrap(), 6.0);
1549    }
1550
1551    #[test]
1552    fn from_slice_wrong_size() {
1553        setup();
1554        let result = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0]);
1555        assert!(result.is_err());
1556    }
1557
1558    #[test]
1559    fn get_out_of_bounds() {
1560        setup();
1561        let t = FabricTensor::zeros(&[2, 3]).unwrap();
1562        assert!(t.get(&[2, 0]).is_err());
1563        assert!(t.get(&[0, 3]).is_err());
1564        assert!(t.get(&[0]).is_err()); // wrong ndim
1565    }
1566
1567    #[test]
1568    fn set_element() {
1569        setup();
1570        let mut t = FabricTensor::zeros(&[2, 2]).unwrap();
1571        t.set(&[1, 0], 42.0).unwrap();
1572        assert_eq!(t.get(&[1, 0]).unwrap(), 42.0);
1573        assert_eq!(t.get(&[0, 0]).unwrap(), 0.0);
1574    }
1575
1576    // ---- Matmul ----
1577
1578    #[test]
1579    fn matmul_2x3_times_3x2() {
1580        setup();
1581        // A = [[1,2,3],[4,5,6]]
1582        let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1583        // B = [[7,8],[9,10],[11,12]]
1584        let b = FabricTensor::from_slice(&[3, 2], &[7.0, 8.0, 9.0, 10.0, 11.0, 12.0]).unwrap();
1585        let c = a.matmul(&b).unwrap();
1586        assert_eq!(c.shape(), &[2, 2]);
1587        // C[0,0] = 1*7 + 2*9 + 3*11 = 58
1588        assert_eq!(c.get(&[0, 0]).unwrap(), 58.0);
1589        // C[0,1] = 1*8 + 2*10 + 3*12 = 64
1590        assert_eq!(c.get(&[0, 1]).unwrap(), 64.0);
1591        // C[1,0] = 4*7 + 5*9 + 6*11 = 139
1592        assert_eq!(c.get(&[1, 0]).unwrap(), 139.0);
1593        // C[1,1] = 4*8 + 5*10 + 6*12 = 154
1594        assert_eq!(c.get(&[1, 1]).unwrap(), 154.0);
1595    }
1596
1597    #[test]
1598    fn matmul_incompatible_dims() {
1599        setup();
1600        let a = FabricTensor::zeros(&[2, 3]).unwrap();
1601        let b = FabricTensor::zeros(&[2, 3]).unwrap();
1602        assert!(a.matmul(&b).is_err());
1603    }
1604
1605    #[test]
1606    fn matmul_not_2d() {
1607        setup();
1608        let a = FabricTensor::zeros(&[2, 3, 4]).unwrap();
1609        let b = FabricTensor::zeros(&[4, 2]).unwrap();
1610        assert!(a.matmul(&b).is_err());
1611    }
1612
1613    // ---- Elementwise add ----
1614
1615    #[test]
1616    fn add_elementwise() {
1617        setup();
1618        let a = FabricTensor::from_slice(&[2, 2], &[1.0, 2.0, 3.0, 4.0]).unwrap();
1619        let b = FabricTensor::from_slice(&[2, 2], &[10.0, 20.0, 30.0, 40.0]).unwrap();
1620        let c = a.add(&b).unwrap();
1621        assert_eq!(c.as_slice(), &[11.0, 22.0, 33.0, 44.0]);
1622    }
1623
1624    #[test]
1625    fn add_shape_mismatch() {
1626        setup();
1627        let a = FabricTensor::zeros(&[2, 3]).unwrap();
1628        let b = FabricTensor::zeros(&[3, 2]).unwrap();
1629        assert!(a.add(&b).is_err());
1630    }
1631
1632    // ---- Elementwise mul ----
1633
1634    #[test]
1635    fn mul_elementwise() {
1636        setup();
1637        let a = FabricTensor::from_slice(&[3], &[2.0, 3.0, 4.0]).unwrap();
1638        let b = FabricTensor::from_slice(&[3], &[5.0, 6.0, 7.0]).unwrap();
1639        let c = FabricTensor::mul(&a, &b).unwrap();
1640        assert_eq!(c.as_slice(), &[10.0, 18.0, 28.0]);
1641    }
1642
1643    // ---- Scale ----
1644
1645    #[test]
1646    fn scale_scalar() {
1647        setup();
1648        let a = FabricTensor::from_slice(&[2, 2], &[1.0, 2.0, 3.0, 4.0]).unwrap();
1649        let b = a.scale(3.0).unwrap();
1650        assert_eq!(b.as_slice(), &[3.0, 6.0, 9.0, 12.0]);
1651    }
1652
1653    // ---- ReLU ----
1654
1655    #[test]
1656    fn relu_clamps_negatives() {
1657        setup();
1658        let a = FabricTensor::from_slice(&[4], &[-2.0, -0.5, 0.0, 3.0]).unwrap();
1659        let b = a.relu().unwrap();
1660        assert_eq!(b.as_slice(), &[0.0, 0.0, 0.0, 3.0]);
1661    }
1662
1663    // ---- Softmax ----
1664
1665    #[test]
1666    fn softmax_sums_to_one() {
1667        setup();
1668        let a = FabricTensor::from_slice(&[1, 4], &[1.0, 2.0, 3.0, 4.0]).unwrap();
1669        let b = a.softmax(1).unwrap();
1670        let sum: f32 = b.as_slice().iter().sum();
1671        assert!((sum - 1.0).abs() < 1e-6, "softmax sum = {sum}");
1672        // Values should be monotonically increasing.
1673        let s = b.as_slice();
1674        assert!(s[0] < s[1]);
1675        assert!(s[1] < s[2]);
1676        assert!(s[2] < s[3]);
1677    }
1678
1679    #[test]
1680    fn softmax_2d_along_axis0() {
1681        setup();
1682        // 2x3 matrix, softmax along axis 0 (column-wise).
1683        let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1684        let b = a.softmax(0).unwrap();
1685        let s = b.as_slice();
1686        // Each column should sum to 1.
1687        for col in 0..3 {
1688            let col_sum = s[col] + s[3 + col];
1689            assert!((col_sum - 1.0).abs() < 1e-6, "column {col} sum = {col_sum}");
1690        }
1691    }
1692
1693    #[test]
1694    fn softmax_invalid_axis() {
1695        setup();
1696        let a = FabricTensor::zeros(&[2, 3]).unwrap();
1697        assert!(a.softmax(2).is_err());
1698    }
1699
1700    // ---- Transpose ----
1701
1702    #[test]
1703    fn transpose_2d() {
1704        setup();
1705        // [[1,2,3],[4,5,6]] -> [[1,4],[2,5],[3,6]]
1706        let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1707        let b = a.transpose().unwrap();
1708        assert_eq!(b.shape(), &[3, 2]);
1709        assert_eq!(b.get(&[0, 0]).unwrap(), 1.0);
1710        assert_eq!(b.get(&[0, 1]).unwrap(), 4.0);
1711        assert_eq!(b.get(&[1, 0]).unwrap(), 2.0);
1712        assert_eq!(b.get(&[1, 1]).unwrap(), 5.0);
1713        assert_eq!(b.get(&[2, 0]).unwrap(), 3.0);
1714        assert_eq!(b.get(&[2, 1]).unwrap(), 6.0);
1715    }
1716
1717    #[test]
1718    fn transpose_1d_fails() {
1719        setup();
1720        let a = FabricTensor::zeros(&[5]).unwrap();
1721        assert!(a.transpose().is_err());
1722    }
1723
1724    // ---- Reshape ----
1725
1726    #[test]
1727    fn reshape_preserves_data() {
1728        setup();
1729        let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1730        let b = a.reshape(&[3, 2]).unwrap();
1731        assert_eq!(b.shape(), &[3, 2]);
1732        assert_eq!(b.as_slice(), a.as_slice());
1733    }
1734
1735    #[test]
1736    fn reshape_wrong_numel() {
1737        setup();
1738        let a = FabricTensor::zeros(&[2, 3]).unwrap();
1739        assert!(a.reshape(&[2, 2]).is_err());
1740    }
1741
1742    // ---- Operator overloading ----
1743
1744    #[test]
1745    fn op_add() {
1746        setup();
1747        let a = FabricTensor::from_slice(&[3], &[1.0, 2.0, 3.0]).unwrap();
1748        let b = FabricTensor::from_slice(&[3], &[4.0, 5.0, 6.0]).unwrap();
1749        let c = (&a + &b).unwrap();
1750        assert_eq!(c.as_slice(), &[5.0, 7.0, 9.0]);
1751    }
1752
1753    #[test]
1754    fn op_mul_elementwise() {
1755        setup();
1756        let a = FabricTensor::from_slice(&[3], &[2.0, 3.0, 4.0]).unwrap();
1757        let b = FabricTensor::from_slice(&[3], &[5.0, 6.0, 7.0]).unwrap();
1758        let c = (&a * &b).unwrap();
1759        assert_eq!(c.as_slice(), &[10.0, 18.0, 28.0]);
1760    }
1761
1762    #[test]
1763    fn op_mul_scalar() {
1764        setup();
1765        let a = FabricTensor::from_slice(&[2], &[3.0, 4.0]).unwrap();
1766        let b = (&a * 2.0).unwrap();
1767        assert_eq!(b.as_slice(), &[6.0, 8.0]);
1768    }
1769
1770    // ---- 3D tensor ----
1771
1772    #[test]
1773    fn tensor_3d_access() {
1774        setup();
1775        // 2x3x4 tensor
1776        let data: Vec<f32> = (0..24).map(|x| x as f32).collect();
1777        let t = FabricTensor::from_slice(&[2, 3, 4], &data).unwrap();
1778        assert_eq!(t.ndim(), 3);
1779        assert_eq!(t.numel(), 24);
1780        assert_eq!(t.strides(), &[12, 4, 1]);
1781        // t[1][2][3] = 1*12 + 2*4 + 3 = 23
1782        assert_eq!(t.get(&[1, 2, 3]).unwrap(), 23.0);
1783    }
1784
1785    // ---- Scalar (0-D) ----
1786
1787    #[test]
1788    fn scalar_tensor() {
1789        setup();
1790        let t = FabricTensor::from_slice(&[], &[42.0]).unwrap();
1791        assert_eq!(t.ndim(), 0);
1792        assert_eq!(t.numel(), 1);
1793        assert_eq!(t.get(&[]).unwrap(), 42.0);
1794    }
1795
1796    #[test]
1797    fn from_mem_lease_wraps_existing() {
1798        setup();
1799        let lease = MemBuilder::new().min_bytes(16).acquire().unwrap();
1800        let t = FabricTensor::from_mem_lease(&[2, 2], lease);
1801        assert_eq!(t.shape(), &[2, 2]);
1802        assert_eq!(t.numel(), 4);
1803        assert!(t.is_cpu());
1804    }
1805
1806    // ---- 3D transpose ----
1807
1808    #[test]
1809    fn transpose_3d_swaps_last_two() {
1810        setup();
1811        // Shape [2,3,4] -> [2,4,3]
1812        let data: Vec<f32> = (0..24).map(|x| x as f32).collect();
1813        let t = FabricTensor::from_slice(&[2, 3, 4], &data).unwrap();
1814        let t2 = t.transpose().unwrap();
1815        assert_eq!(t2.shape(), &[2, 4, 3]);
1816        // t[0][1][2] = 0*12 + 1*4 + 2 = 6 -> t2[0][2][1] = 6
1817        assert_eq!(t2.get(&[0, 2, 1]).unwrap(), 6.0);
1818        // t[1][0][3] = 1*12 + 0*4 + 3 = 15 -> t2[1][3][0] = 15
1819        assert_eq!(t2.get(&[1, 3, 0]).unwrap(), 15.0);
1820    }
1821
1822    #[test]
1823    fn default_device_is_cpu() {
1824        setup();
1825        let t = FabricTensor::zeros(&[2, 2]).unwrap();
1826        assert_eq!(t.device(), Device::Cpu);
1827        assert!(t.is_cpu());
1828        assert!(!t.is_gpu());
1829    }
1830
1831    #[test]
1832    #[cfg(not(feature = "gpu"))]
1833    fn to_gpu_without_feature_is_unsupported() {
1834        setup();
1835        let t = FabricTensor::zeros(&[2, 2]).unwrap();
1836        assert!(matches!(t.to_gpu(), Err(FabricError::Unsupported)));
1837    }
1838
1839    #[test]
1840    #[cfg(feature = "gpu")]
1841    fn to_gpu_to_cpu_roundtrip_preserves_data() {
1842        setup();
1843        let t = FabricTensor::from_slice(&[2, 2], &[1.0, 2.0, 3.0, 4.0]).unwrap();
1844        let gpu_t = t.to_gpu().unwrap();
1845        assert!(gpu_t.is_gpu());
1846        assert!(!gpu_t.is_cpu());
1847
1848        let cpu_t = gpu_t.to_cpu().unwrap();
1849        assert!(cpu_t.is_cpu());
1850        assert_eq!(cpu_t.shape(), &[2, 2]);
1851        assert_eq!(cpu_t.as_slice(), &[1.0, 2.0, 3.0, 4.0]);
1852    }
1853
1854    #[test]
1855    #[cfg(feature = "gpu")]
1856    fn gpu_matmul_dispatches_when_both_operands_are_gpu() {
1857        setup();
1858        let a = FabricTensor::from_slice(&[2, 2], &[1.0, 2.0, 3.0, 4.0])
1859            .unwrap()
1860            .to_gpu()
1861            .unwrap();
1862        let b = FabricTensor::from_slice(&[2, 2], &[5.0, 6.0, 7.0, 8.0])
1863            .unwrap()
1864            .to_gpu()
1865            .unwrap();
1866        host::test_mock::_set_gpu_session_error(Some(-1));
1867        assert!(matches!(a.matmul(&b), Err(FabricError::Disconnected)));
1868        host::test_mock::_set_gpu_session_error(None);
1869    }
1870
1871    #[test]
1872    #[cfg(feature = "gpu")]
1873    fn gpu_binary_ops_return_gpu_placed_result() {
1874        setup();
1875        let a = FabricTensor::from_slice(&[2, 2], &[1.0, 2.0, 3.0, 4.0])
1876            .unwrap()
1877            .to_gpu()
1878            .unwrap();
1879        let b = FabricTensor::from_slice(&[2, 2], &[5.0, 6.0, 7.0, 8.0])
1880            .unwrap()
1881            .to_gpu()
1882            .unwrap();
1883        let result = a.add(&b).unwrap();
1884        assert!(result.is_gpu());
1885        assert_eq!(result.shape(), &[2, 2]);
1886        assert_eq!(result.as_slice(), &[6.0, 8.0, 10.0, 12.0]);
1887    }
1888
1889    #[test]
1890    #[cfg(feature = "gpu")]
1891    fn gpu_unary_ops_return_gpu_placed_result() {
1892        setup();
1893        let a = FabricTensor::from_slice(&[4], &[-2.0, -0.5, 0.0, 3.0])
1894            .unwrap()
1895            .to_gpu()
1896            .unwrap();
1897        let result = a.relu().unwrap();
1898        assert!(result.is_gpu());
1899        assert_eq!(result.shape(), &[4]);
1900        assert_eq!(result.as_slice(), &[0.0, 0.0, 0.0, 3.0]);
1901    }
1902
1903    // ---- FFT / IFFT ----
1904
1905    #[test]
1906    fn fft_ifft_roundtrip() {
1907        setup();
1908        let input = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
1909        let t = FabricTensor::from_slice(&[8], &input).unwrap();
1910        let freq = t.fft().unwrap();
1911        assert_eq!(freq.shape(), &[16]); // 8 complex bins = 16 floats
1912        let recovered = freq.ifft().unwrap();
1913        assert_eq!(recovered.shape(), &[8]);
1914        for (i, &val) in recovered.as_slice().iter().enumerate() {
1915            assert!(
1916                (val - input[i]).abs() < 1e-4,
1917                "sample {i}: expected {}, got {val}",
1918                input[i]
1919            );
1920        }
1921    }
1922
1923    #[test]
1924    fn fft_not_1d_fails() {
1925        setup();
1926        let t = FabricTensor::zeros(&[2, 4]).unwrap();
1927        assert!(t.fft().is_err());
1928    }
1929
1930    #[test]
1931    fn fft_non_power_of_two_fails() {
1932        setup();
1933        let t = FabricTensor::from_slice(&[3], &[1.0, 2.0, 3.0]).unwrap();
1934        assert!(t.fft().is_err());
1935    }
1936
1937    #[test]
1938    #[cfg(feature = "gpu")]
1939    fn gpu_fft_dispatches_and_produces_correct_result() {
1940        setup();
1941        let input = vec![1.0, 0.0, -1.0, 0.0];
1942        let t = FabricTensor::from_slice(&[4], &input)
1943            .unwrap()
1944            .to_gpu()
1945            .unwrap();
1946        let freq = t.fft().unwrap();
1947        assert!(freq.is_gpu());
1948        assert_eq!(freq.shape(), &[8]); // 4 bins * 2 floats
1949
1950        // Verify roundtrip through IFFT
1951        let recovered = freq.ifft().unwrap();
1952        assert!(recovered.is_gpu());
1953        assert_eq!(recovered.shape(), &[4]);
1954        for (i, &val) in recovered.as_slice().iter().enumerate() {
1955            assert!(
1956                (val - input[i]).abs() < 1e-4,
1957                "sample {i}: expected {}, got {val}",
1958                input[i]
1959            );
1960        }
1961    }
1962
1963    // ---- Subtract ----
1964
1965    #[test]
1966    fn subtract_elementwise() {
1967        setup();
1968        let a = FabricTensor::from_slice(&[3], &[5.0, 3.0, 1.0]).unwrap();
1969        let b = FabricTensor::from_slice(&[3], &[1.0, 2.0, 3.0]).unwrap();
1970        let c = a.subtract(&b).unwrap();
1971        assert_eq!(c.as_slice(), &[4.0, 1.0, -2.0]);
1972    }
1973
1974    #[test]
1975    fn subtract_operator() {
1976        setup();
1977        let a = FabricTensor::from_slice(&[2], &[10.0, 5.0]).unwrap();
1978        let b = FabricTensor::from_slice(&[2], &[3.0, 7.0]).unwrap();
1979        let c = (&a - &b).unwrap();
1980        assert_eq!(c.as_slice(), &[7.0, -2.0]);
1981    }
1982
1983    #[test]
1984    fn subtract_shape_mismatch() {
1985        setup();
1986        let a = FabricTensor::from_slice(&[2], &[1.0, 2.0]).unwrap();
1987        let b = FabricTensor::from_slice(&[3], &[1.0, 2.0, 3.0]).unwrap();
1988        assert!(a.subtract(&b).is_err());
1989    }
1990
1991    // ---- Sum axis ----
1992
1993    #[test]
1994    fn sum_axis_row_sums() {
1995        setup();
1996        let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1997        let s = a.sum_axis(1).unwrap();
1998        assert_eq!(s.shape(), &[2]);
1999        assert_eq!(s.as_slice(), &[6.0, 15.0]);
2000    }
2001
2002    #[test]
2003    fn sum_axis_col_sums() {
2004        setup();
2005        let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
2006        let s = a.sum_axis(0).unwrap();
2007        assert_eq!(s.shape(), &[3]);
2008        assert_eq!(s.as_slice(), &[5.0, 7.0, 9.0]);
2009    }
2010
2011    #[test]
2012    fn sum_axis_1d() {
2013        setup();
2014        let a = FabricTensor::from_slice(&[4], &[1.0, 2.0, 3.0, 4.0]).unwrap();
2015        let s = a.sum_axis(0).unwrap();
2016        assert_eq!(s.shape(), &[1]);
2017        assert_eq!(s.as_slice(), &[10.0]);
2018    }
2019
2020    #[test]
2021    fn sum_axis_out_of_bounds() {
2022        setup();
2023        let a = FabricTensor::from_slice(&[2, 3], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
2024        assert!(a.sum_axis(2).is_err());
2025    }
2026
2027    // ---- Sigmoid ----
2028
2029    #[test]
2030    fn sigmoid_values() {
2031        setup();
2032        let a = FabricTensor::from_slice(&[3], &[0.0, 100.0, -100.0]).unwrap();
2033        let b = a.sigmoid().unwrap();
2034        assert!((b.as_slice()[0] - 0.5).abs() < 1e-6);
2035        assert!((b.as_slice()[1] - 1.0).abs() < 1e-4);
2036        assert!(b.as_slice()[2] < 1e-4);
2037    }
2038
2039    // ---- Ln ----
2040
2041    #[test]
2042    fn ln_values() {
2043        setup();
2044        let a = FabricTensor::from_slice(&[3], &[1.0, core::f32::consts::E, 10.0]).unwrap();
2045        let b = a.ln().unwrap();
2046        assert!((b.as_slice()[0] - 0.0).abs() < 1e-6);
2047        assert!((b.as_slice()[1] - 1.0).abs() < 1e-5);
2048        assert!((b.as_slice()[2] - 10.0f32.ln()).abs() < 1e-5);
2049    }
2050
2051    // ---- Clip ----
2052
2053    #[test]
2054    fn clip_values() {
2055        setup();
2056        let a = FabricTensor::from_slice(&[4], &[-1.0, 0.5, 1.5, 3.0]).unwrap();
2057        let b = a.clip(0.0, 1.0).unwrap();
2058        assert_eq!(b.as_slice(), &[0.0, 0.5, 1.0, 1.0]);
2059    }
2060}