cvx_analytics/
motifs.rs

1//! Temporal motif discovery via Matrix Profile.
2//!
3//! Detects **recurring patterns** (motifs) and **anomalous subsequences** (discords)
4//! in entity trajectories. Adapted from the STOMP algorithm (Zhu et al., 2016)
5//! to operate on multi-dimensional embedding trajectories.
6//!
7//! # Key insight
8//!
9//! Operating on region trajectories (K ≈ 60-80 dims) rather than raw embeddings
10//! (D = 768) makes the Matrix Profile tractable: O(N² × K) for a 500-window
11//! trajectory with K=80 is ~20M operations.
12//!
13//! # References
14//!
15//! - Yeh, C.-C. M. et al. (2016). Matrix Profile I. *IEEE ICDM*.
16//! - Zhu, Y. et al. (2016). Matrix Profile II: STOMP. *IEEE ICDM*.
17
18use cvx_core::error::AnalyticsError;
19
20// ─── Types ──────────────────────────────────────────────────────────
21
22/// A discovered recurring pattern in a trajectory.
23#[derive(Debug, Clone)]
24pub struct Motif {
25    /// Index of the canonical occurrence in the trajectory.
26    pub canonical_index: usize,
27    /// All occurrences: `(start_index, distance_to_canonical)`.
28    pub occurrences: Vec<MotifOccurrence>,
29    /// Detected period in number of steps (if regular). `None` if aperiodic.
30    pub period: Option<usize>,
31    /// Mean distance across all occurrences to canonical.
32    pub mean_match_distance: f32,
33}
34
35/// A single occurrence of a motif.
36#[derive(Debug, Clone)]
37pub struct MotifOccurrence {
38    /// Start index in the trajectory.
39    pub start_index: usize,
40    /// Timestamp of the start of this occurrence.
41    pub timestamp: i64,
42    /// Distance to the canonical motif.
43    pub distance: f32,
44}
45
46/// An anomalous subsequence (most unlike anything else in the trajectory).
47#[derive(Debug, Clone)]
48pub struct Discord {
49    /// Start index in the trajectory.
50    pub start_index: usize,
51    /// Timestamp of the start.
52    pub timestamp: i64,
53    /// Matrix Profile value (nearest-neighbor distance — high = unusual).
54    pub nn_distance: f32,
55}
56
57// ─── Matrix Profile computation ─────────────────────────────────────
58
59/// Compute the Matrix Profile for a multi-dimensional trajectory.
60///
61/// Returns `(profile, profile_index)` where:
62/// - `profile[i]` = distance to the nearest non-trivial match of subsequence i
63/// - `profile_index[i]` = index of that nearest match
64///
65/// Uses the STOMP-like approach: for each subsequence, compute distances to
66/// all other non-overlapping subsequences and record the minimum.
67///
68/// # Arguments
69///
70/// * `trajectory` — Time-ordered `(timestamp, vector)` pairs
71/// * `window` — Subsequence length (number of time steps)
72/// * `exclusion_zone` — Fraction of window size to exclude around each
73///   subsequence to avoid trivial matches (default: 0.5)
74///
75/// # Errors
76///
77/// Returns [`AnalyticsError::InsufficientData`] if trajectory length < 2 * window.
78pub fn matrix_profile(
79    trajectory: &[(i64, &[f32])],
80    window: usize,
81    exclusion_zone: f32,
82) -> Result<(Vec<f32>, Vec<usize>), AnalyticsError> {
83    let n = trajectory.len();
84    if n < 2 * window {
85        return Err(AnalyticsError::InsufficientData {
86            needed: 2 * window,
87            have: n,
88        });
89    }
90
91    let n_subs = n - window + 1;
92    let excl = ((window as f32) * exclusion_zone).ceil() as usize;
93
94    let mut profile = vec![f32::MAX; n_subs];
95    let mut profile_index = vec![0usize; n_subs];
96
97    for i in 0..n_subs {
98        for j in 0..n_subs {
99            // Skip if within exclusion zone
100            if i.abs_diff(j) < excl {
101                continue;
102            }
103
104            let dist = subsequence_distance(trajectory, i, j, window);
105
106            if dist < profile[i] {
107                profile[i] = dist;
108                profile_index[i] = j;
109            }
110        }
111    }
112
113    Ok((profile, profile_index))
114}
115
116/// Euclidean distance between two subsequences of the trajectory.
117fn subsequence_distance(trajectory: &[(i64, &[f32])], i: usize, j: usize, window: usize) -> f32 {
118    let mut sum_sq = 0.0f64;
119    for step in 0..window {
120        let vi = trajectory[i + step].1;
121        let vj = trajectory[j + step].1;
122        for (a, b) in vi.iter().zip(vj.iter()) {
123            let diff = (*a as f64) - (*b as f64);
124            sum_sq += diff * diff;
125        }
126    }
127    (sum_sq / window as f64).sqrt() as f32
128}
129
130// ─── Motif discovery ────────────────────────────────────────────────
131
132/// Discover the top-k recurring motifs in a trajectory.
133///
134/// A motif is the subsequence with the smallest Matrix Profile value
135/// (i.e., it has a very similar match elsewhere). We follow the Matrix
136/// Profile Index chain to find all occurrences.
137///
138/// # Arguments
139///
140/// * `trajectory` — Time-ordered `(timestamp, vector)` pairs
141/// * `window` — Subsequence length
142/// * `max_motifs` — Maximum number of motifs to return
143/// * `exclusion_zone` — Fraction of window for non-trivial matches (default: 0.5)
144///
145/// # Errors
146///
147/// Returns [`AnalyticsError::InsufficientData`] if trajectory too short.
148pub fn discover_motifs(
149    trajectory: &[(i64, &[f32])],
150    window: usize,
151    max_motifs: usize,
152    exclusion_zone: f32,
153) -> Result<Vec<Motif>, AnalyticsError> {
154    let (profile, profile_index) = matrix_profile(trajectory, window, exclusion_zone)?;
155    let n_subs = profile.len();
156    let excl = ((window as f32) * exclusion_zone).ceil() as usize;
157
158    // Sort by profile value (smallest = best motif)
159    let mut indices: Vec<usize> = (0..n_subs).collect();
160    indices.sort_by(|&a, &b| profile[a].partial_cmp(&profile[b]).unwrap());
161
162    let mut motifs: Vec<Motif> = Vec::new();
163    let mut used = vec![false; n_subs]; // prevent overlapping motifs
164
165    for &candidate in &indices {
166        if motifs.len() >= max_motifs {
167            break;
168        }
169        if used[candidate] || profile[candidate] == f32::MAX {
170            continue;
171        }
172
173        // Collect all occurrences: subsequences within 2× the motif distance
174        let threshold = (profile[candidate] * 2.0).max(1e-6);
175        let mut occurrences: Vec<MotifOccurrence> = Vec::new();
176
177        // The canonical occurrence
178        occurrences.push(MotifOccurrence {
179            start_index: candidate,
180            timestamp: trajectory[candidate].0,
181            distance: 0.0,
182        });
183
184        // The nearest match
185        let nn = profile_index[candidate];
186        occurrences.push(MotifOccurrence {
187            start_index: nn,
188            timestamp: trajectory[nn].0,
189            distance: profile[candidate],
190        });
191
192        // Find additional occurrences
193        for k in 0..n_subs {
194            if k == candidate || k == nn {
195                continue;
196            }
197            if k.abs_diff(candidate) < excl || k.abs_diff(nn) < excl {
198                continue;
199            }
200            let dist = subsequence_distance(trajectory, candidate, k, window);
201            if dist <= threshold {
202                occurrences.push(MotifOccurrence {
203                    start_index: k,
204                    timestamp: trajectory[k].0,
205                    distance: dist,
206                });
207            }
208        }
209
210        // Mark all occurrences as used
211        for occ in &occurrences {
212            let start = occ.start_index.saturating_sub(excl);
213            let end = (occ.start_index + excl).min(n_subs);
214            for flag in used.iter_mut().take(end).skip(start) {
215                *flag = true;
216            }
217        }
218
219        // Sort occurrences by index (temporal order)
220        occurrences.sort_by_key(|o| o.start_index);
221
222        // Detect periodicity
223        let period = detect_period(&occurrences);
224
225        let mean_match_distance = if occurrences.len() > 1 {
226            occurrences.iter().map(|o| o.distance).sum::<f32>() / occurrences.len() as f32
227        } else {
228            0.0
229        };
230
231        motifs.push(Motif {
232            canonical_index: candidate,
233            occurrences,
234            period,
235            mean_match_distance,
236        });
237    }
238
239    Ok(motifs)
240}
241
242// ─── Discord discovery ──────────────────────────────────────────────
243
244/// Discover the top-k anomalous subsequences (discords).
245///
246/// A discord is the subsequence with the **largest** Matrix Profile value
247/// (i.e., it is most unlike any other subsequence).
248///
249/// # Arguments
250///
251/// * `trajectory` — Time-ordered `(timestamp, vector)` pairs
252/// * `window` — Subsequence length
253/// * `max_discords` — Maximum number of discords to return
254///
255/// # Errors
256///
257/// Returns [`AnalyticsError::InsufficientData`] if trajectory too short.
258pub fn discover_discords(
259    trajectory: &[(i64, &[f32])],
260    window: usize,
261    max_discords: usize,
262) -> Result<Vec<Discord>, AnalyticsError> {
263    let (profile, _) = matrix_profile(trajectory, window, 0.5)?;
264    let n_subs = profile.len();
265    let excl = ((window as f32) * 0.5).ceil() as usize;
266
267    // Sort by profile value descending (largest = most anomalous)
268    let mut indices: Vec<usize> = (0..n_subs).collect();
269    indices.sort_by(|&a, &b| profile[b].partial_cmp(&profile[a]).unwrap());
270
271    let mut discords: Vec<Discord> = Vec::new();
272    let mut used = vec![false; n_subs];
273
274    for &candidate in &indices {
275        if discords.len() >= max_discords {
276            break;
277        }
278        if used[candidate] || profile[candidate] == f32::MAX {
279            continue;
280        }
281
282        discords.push(Discord {
283            start_index: candidate,
284            timestamp: trajectory[candidate].0,
285            nn_distance: profile[candidate],
286        });
287
288        // Mark exclusion zone
289        let start = candidate.saturating_sub(excl);
290        let end = (candidate + excl).min(n_subs);
291        for flag in used.iter_mut().take(end).skip(start) {
292            *flag = true;
293        }
294    }
295
296    Ok(discords)
297}
298
299// ─── Helpers ────────────────────────────────────────────────────────
300
301/// Detect periodicity from occurrence indices.
302///
303/// If gaps between consecutive occurrences are within 20% of the median gap,
304/// returns the median gap as the period. Otherwise returns `None`.
305fn detect_period(occurrences: &[MotifOccurrence]) -> Option<usize> {
306    if occurrences.len() < 3 {
307        return None;
308    }
309
310    let gaps: Vec<usize> = occurrences
311        .windows(2)
312        .map(|w| w[1].start_index - w[0].start_index)
313        .collect();
314
315    if gaps.is_empty() {
316        return None;
317    }
318
319    let mut sorted_gaps = gaps.clone();
320    sorted_gaps.sort();
321    let median_gap = sorted_gaps[sorted_gaps.len() / 2];
322
323    if median_gap == 0 {
324        return None;
325    }
326
327    // Check regularity: all gaps within 20% of median
328    let tolerance = (median_gap as f64 * 0.2).max(1.0) as usize;
329    let is_regular = gaps.iter().all(|&g| g.abs_diff(median_gap) <= tolerance);
330
331    if is_regular { Some(median_gap) } else { None }
332}
333
334// ─── Tests ──────────────────────────────────────────────────────────
335
336#[cfg(test)]
337#[allow(clippy::needless_range_loop)]
338mod tests {
339    use super::*;
340
341    fn as_refs(points: &[(i64, Vec<f32>)]) -> Vec<(i64, &[f32])> {
342        points.iter().map(|(t, v)| (*t, v.as_slice())).collect()
343    }
344
345    /// Build a trajectory with a planted motif that repeats at known positions.
346    fn trajectory_with_planted_motif() -> Vec<(i64, Vec<f32>)> {
347        let dim = 4;
348        let n = 100;
349        let mut traj = Vec::with_capacity(n);
350
351        for i in 0..n {
352            let t = i as i64 * 1000;
353            // Base: slowly varying signal
354            let base: Vec<f32> = (0..dim).map(|d| (i as f32 * 0.01 + d as f32)).collect();
355            traj.push((t, base));
356        }
357
358        // Plant a motif at positions 10, 40, 70 (period = 30)
359        let motif_pattern: Vec<Vec<f32>> = (0..5)
360            .map(|step| {
361                (0..dim)
362                    .map(|d| 10.0 + (step as f32 * 0.5) + d as f32 * 0.1)
363                    .collect()
364            })
365            .collect();
366
367        for &start in &[10, 40, 70] {
368            for (step, pattern) in motif_pattern.iter().enumerate() {
369                traj[start + step].1 = pattern.clone();
370            }
371        }
372
373        traj
374    }
375
376    // ─── matrix_profile ─────────────────────────────────────────
377
378    #[test]
379    fn matrix_profile_insufficient_data() {
380        let owned = vec![(0i64, vec![1.0f32]), (1, vec![2.0])];
381        let traj = as_refs(&owned);
382        let result = matrix_profile(&traj, 3, 0.5);
383        assert!(result.is_err());
384    }
385
386    #[test]
387    fn matrix_profile_basic() {
388        // Simple trajectory with 10 points
389        let owned: Vec<(i64, Vec<f32>)> = (0..10)
390            .map(|i| (i as i64 * 1000, vec![i as f32 * 0.1]))
391            .collect();
392        let traj = as_refs(&owned);
393
394        let (profile, index) = matrix_profile(&traj, 3, 0.5).unwrap();
395        assert_eq!(profile.len(), 8); // n - window + 1 = 10 - 3 + 1
396        assert_eq!(index.len(), 8);
397
398        // All profile values should be finite and positive
399        for &val in &profile {
400            assert!(val.is_finite());
401            assert!(val >= 0.0);
402        }
403    }
404
405    #[test]
406    fn matrix_profile_identical_subsequences() {
407        // Trajectory where positions 0-2 and 5-7 are identical
408        let mut owned: Vec<(i64, Vec<f32>)> =
409            (0..10).map(|i| (i as i64 * 1000, vec![i as f32])).collect();
410
411        // Make positions 5,6,7 identical to 0,1,2
412        owned[5].1 = vec![0.0];
413        owned[6].1 = vec![1.0];
414        owned[7].1 = vec![2.0];
415
416        let traj = as_refs(&owned);
417        let (profile, index) = matrix_profile(&traj, 3, 0.5).unwrap();
418
419        // The profile at index 0 should be very low (has a near-perfect match at 5)
420        assert!(
421            profile[0] < 0.01,
422            "identical subsequences should have ~0 profile, got {}",
423            profile[0]
424        );
425        assert_eq!(index[0], 5);
426    }
427
428    // ─── discover_motifs ────────────────────────────────────────
429
430    #[test]
431    fn motifs_planted_pattern() {
432        let owned = trajectory_with_planted_motif();
433        let traj = as_refs(&owned);
434
435        let motifs = discover_motifs(&traj, 5, 3, 0.5).unwrap();
436
437        assert!(!motifs.is_empty(), "should find at least one motif");
438
439        let best = &motifs[0];
440        assert!(
441            best.occurrences.len() >= 2,
442            "best motif should have at least 2 occurrences, got {}",
443            best.occurrences.len()
444        );
445
446        // The canonical should be at one of the planted positions (10, 40, 70)
447        let planted = [10usize, 40, 70];
448        let found_planted = best
449            .occurrences
450            .iter()
451            .any(|o| planted.iter().any(|&p| o.start_index.abs_diff(p) <= 2));
452        assert!(
453            found_planted,
454            "should find at least one planted position, got indices: {:?}",
455            best.occurrences
456                .iter()
457                .map(|o| o.start_index)
458                .collect::<Vec<_>>()
459        );
460    }
461
462    #[test]
463    fn motifs_period_detection() {
464        let owned = trajectory_with_planted_motif();
465        let traj = as_refs(&owned);
466
467        let motifs = discover_motifs(&traj, 5, 1, 0.5).unwrap();
468        assert!(!motifs.is_empty());
469
470        let best = &motifs[0];
471        if best.occurrences.len() >= 3 {
472            // If 3 occurrences found, period should be detected as ~30
473            if let Some(period) = best.period {
474                assert!(
475                    period.abs_diff(30) <= 6,
476                    "expected period ~30, got {period}"
477                );
478            }
479        }
480    }
481
482    #[test]
483    fn motifs_stationary_trajectory() {
484        // Constant trajectory — everything is a motif
485        let owned: Vec<(i64, Vec<f32>)> =
486            (0..30).map(|i| (i as i64 * 1000, vec![1.0, 2.0])).collect();
487        let traj = as_refs(&owned);
488
489        let motifs = discover_motifs(&traj, 5, 3, 0.5).unwrap();
490        assert!(!motifs.is_empty());
491        // Best motif should have very low match distance
492        assert!(
493            motifs[0].mean_match_distance < 0.01,
494            "constant trajectory should have ~0 match distance"
495        );
496    }
497
498    #[test]
499    fn motifs_max_motifs_respected() {
500        let owned: Vec<(i64, Vec<f32>)> = (0..50)
501            .map(|i| (i as i64 * 1000, vec![(i as f32).sin()]))
502            .collect();
503        let traj = as_refs(&owned);
504
505        let motifs = discover_motifs(&traj, 5, 2, 0.5).unwrap();
506        assert!(motifs.len() <= 2);
507    }
508
509    // ─── discover_discords ──────────────────────────────────────
510
511    #[test]
512    fn discords_planted_anomaly() {
513        let dim = 2;
514        let mut owned: Vec<(i64, Vec<f32>)> =
515            (0..50).map(|i| (i as i64 * 1000, vec![1.0; dim])).collect();
516
517        // Plant an anomaly at position 25
518        for step in 25..30 {
519            owned[step].1 = vec![100.0; dim];
520        }
521
522        let traj = as_refs(&owned);
523        let discords = discover_discords(&traj, 5, 3).unwrap();
524
525        assert!(!discords.is_empty(), "should detect planted anomaly");
526
527        // The top discord should be near position 25
528        let top = &discords[0];
529        assert!(
530            top.start_index.abs_diff(25) <= 5,
531            "top discord should be near position 25, got {}",
532            top.start_index
533        );
534        assert!(
535            top.nn_distance > 1.0,
536            "discord should have high nn_distance, got {}",
537            top.nn_distance
538        );
539    }
540
541    #[test]
542    fn discords_constant_trajectory() {
543        // Constant trajectory — no real discords
544        let owned: Vec<(i64, Vec<f32>)> = (0..30).map(|i| (i as i64 * 1000, vec![1.0])).collect();
545        let traj = as_refs(&owned);
546
547        let discords = discover_discords(&traj, 5, 3).unwrap();
548        // All nn_distances should be very low
549        for d in &discords {
550            assert!(
551                d.nn_distance < 0.01,
552                "constant trajectory should have ~0 discord distance"
553            );
554        }
555    }
556
557    #[test]
558    fn discords_max_discords_respected() {
559        let owned: Vec<(i64, Vec<f32>)> = (0..50)
560            .map(|i| (i as i64 * 1000, vec![(i as f32 * 0.1).sin()]))
561            .collect();
562        let traj = as_refs(&owned);
563
564        let discords = discover_discords(&traj, 5, 1).unwrap();
565        assert!(discords.len() <= 1);
566    }
567
568    // ─── detect_period ──────────────────────────────────────────
569
570    #[test]
571    fn period_regular_spacing() {
572        let occs = vec![
573            MotifOccurrence {
574                start_index: 10,
575                timestamp: 10000,
576                distance: 0.0,
577            },
578            MotifOccurrence {
579                start_index: 30,
580                timestamp: 30000,
581                distance: 0.1,
582            },
583            MotifOccurrence {
584                start_index: 50,
585                timestamp: 50000,
586                distance: 0.1,
587            },
588            MotifOccurrence {
589                start_index: 70,
590                timestamp: 70000,
591                distance: 0.05,
592            },
593        ];
594        let period = detect_period(&occs);
595        assert_eq!(period, Some(20));
596    }
597
598    #[test]
599    fn period_irregular_spacing() {
600        let occs = vec![
601            MotifOccurrence {
602                start_index: 5,
603                timestamp: 5000,
604                distance: 0.0,
605            },
606            MotifOccurrence {
607                start_index: 10,
608                timestamp: 10000,
609                distance: 0.1,
610            },
611            MotifOccurrence {
612                start_index: 50,
613                timestamp: 50000,
614                distance: 0.1,
615            },
616        ];
617        let period = detect_period(&occs);
618        assert!(period.is_none(), "irregular gaps should return None");
619    }
620
621    #[test]
622    fn period_too_few_occurrences() {
623        let occs = vec![
624            MotifOccurrence {
625                start_index: 0,
626                timestamp: 0,
627                distance: 0.0,
628            },
629            MotifOccurrence {
630                start_index: 10,
631                timestamp: 10000,
632                distance: 0.1,
633            },
634        ];
635        assert!(detect_period(&occs).is_none());
636    }
637
638    // ─── subsequence_distance ───────────────────────────────────
639
640    #[test]
641    fn subseq_distance_identical() {
642        let owned: Vec<(i64, Vec<f32>)> = (0..10).map(|i| (i as i64, vec![1.0, 2.0])).collect();
643        let traj = as_refs(&owned);
644        let dist = subsequence_distance(&traj, 0, 5, 3);
645        assert!(
646            dist < 1e-6,
647            "identical subsequences should have ~0 distance"
648        );
649    }
650
651    #[test]
652    fn subseq_distance_different() {
653        let mut owned: Vec<(i64, Vec<f32>)> = (0..10).map(|i| (i as i64, vec![0.0])).collect();
654        // Make positions 5,6,7 = [10.0]
655        for i in 5..8 {
656            owned[i].1 = vec![10.0];
657        }
658        let traj = as_refs(&owned);
659        let dist = subsequence_distance(&traj, 0, 5, 3);
660        assert!(
661            dist > 1.0,
662            "different subsequences should have large distance"
663        );
664    }
665}