Number Theoretic Transform (NTT)
(weilycoder/poly/ntt.hpp)

View this file on GitHub
Last update: 2025-11-07 09:14:02+08:00
Include: #include "weilycoder/poly/ntt.hpp"

Depends on

Required by

weilycoder/poly/ntt_convolve.hpp

Verified with

test/convolution_mod.test.cpp

Code

#ifndef WEILYCODER_POLY_NTT_HPP
#define WEILYCODER_POLY_NTT_HPP

#include "../number-theory/mod_utility.hpp"
#include "../number-theory/primitive_root.hpp"
#include "fft_utility.hpp"
#include <cstdint>
#include <vector>

namespace weilycoder {
/**
 * @brief Number Theoretic Transform (NTT)
 * @tparam mod The prime modulus
 * @tparam inverse Whether to perform the inverse NTT
 * @tparam bit32 Whether to use 32-bit modular multiplication
 * @tparam root A primitive root modulo mod
 * @param y The input/output vector to be transformed
 */
template <uint64_t mod, bool inverse = false, bool bit32 = false,
          uint64_t root = prime_primitive_root<mod>()>
void ntt(std::vector<uint64_t> &y) {
  static_assert(is_prime(mod), "mod must be a prime");
  fft_change(y);
  size_t len = y.size();
  if (len == 0 || (len & (len - 1)) != 0)
    throw std::invalid_argument("Length of input vector must be a power of two");
  if ((mod - 1) % len != 0)
    throw std::invalid_argument(
        "mod - 1 must be divisible by the length of input vector");
  constexpr uint64_t g = inverse ? mod_pow<bit32>(root, mod - 2, mod) : root;
  for (size_t h = 2; h <= len; h <<= 1) {
    uint64_t wn = mod_pow<bit32>(g, (mod - 1) / h, mod);
    for (size_t j = 0; j < len; j += h) {
      uint64_t w = 1;
      for (size_t k = j; k < j + (h >> 1); ++k) {
        uint64_t u = y[k];
        uint64_t t = mod_mul<bit32>(w, y[k + (h >> 1)], mod);
        y[k] = mod_add<bit32>(u, t, mod);
        y[k + (h >> 1)] = mod_sub<bit32>(u, t, mod);
        w = mod_mul<bit32>(w, wn, mod);
      }
    }
  }
  if constexpr (inverse) {
    uint64_t inv_len = mod_pow<bit32>(len, mod - 2, mod);
    for (size_t i = 0; i < len; ++i)
      y[i] = mod_mul<bit32>(y[i], inv_len, mod);
  }
}

/**
 * @brief Number Theoretic Transform (NTT) using 32-bit modular multiplication
 * @tparam mod The prime modulus
 * @tparam inverse Whether to perform the inverse NTT
 * @tparam root A primitive root modulo mod
 * @param y The input/output vector to be transformed
 */
template <uint64_t mod, bool inverse = false, uint64_t root = prime_primitive_root(mod)>
void ntt_32(std::vector<uint64_t> &y) {
  ntt<mod, inverse, true, root>(y);
}
} // namespace weilycoder

#endif

#line 1 "weilycoder/poly/ntt.hpp"



#line 1 "weilycoder/number-theory/mod_utility.hpp"



/**
 * @file mod_utility.hpp
 * @brief Modular Arithmetic Utilities
 */

#include <cstdint>

namespace weilycoder {
using u128 = unsigned __int128;

template <bool bit32 = false>
constexpr uint64_t mod_add(uint64_t a, uint64_t b, uint64_t modulus) {
  if constexpr (bit32) {
    uint64_t res = a + b;
    if (res >= modulus)
      res -= modulus;
    return res;
  } else {
    u128 res = static_cast<u128>(a) + b;
    if (res >= modulus)
      res -= modulus;
    return res;
  }
}

template <bool bit32 = false>
constexpr uint64_t mod_sub(uint64_t a, uint64_t b, uint64_t modulus) {
  if constexpr (bit32) {
    uint64_t res = (a >= b) ? (a - b) : (modulus + a - b);
    return res;
  } else {
    u128 res = (a >= b) ? (a - b) : (static_cast<u128>(modulus) + a - b);
    return res;
  }
}

/**
 * @brief Perform modular multiplication for 64-bit integers.
 * @tparam bit32 If true, won't use 128-bit arithmetic. You should ensure that
 *         all inputs are small enough to avoid overflow (i.e. bit-32).
 * @param a The first multiplicand.
 * @param b The second multiplicand.
 * @param modulus The modulus.
 * @return (a * b) % modulus
 */
template <bool bit32 = false>
constexpr uint64_t mod_mul(uint64_t a, uint64_t b, uint64_t modulus) {
  if constexpr (bit32)
    return a * b % modulus;
  else
    return static_cast<u128>(a) * b % modulus;
}

/**
 * @brief Perform modular multiplication for 64-bit integers with a compile-time
 *        modulus.
 * @tparam Modulus The modulus.
 * @tparam bit32 If true, won't use 128-bit arithmetic. You should ensure that
 *         all inputs are small enough to avoid overflow (i.e. bit-32).
 * @param a The first multiplicand.
 * @param b The second multiplicand.
 * @return (a * b) % Modulus
 */
template <uint64_t Modulus, bool bit32 = false>
constexpr uint64_t mod_mul(uint64_t a, uint64_t b) {
  if constexpr (bit32)
    return a * b % Modulus;
  else
    return static_cast<u128>(a) * b % Modulus;
}

/**
 * @brief Perform modular exponentiation for 64-bit integers.
 * @tparam bit32 If true, won't use 128-bit arithmetic. You should ensure that
 *         all inputs are small enough to avoid overflow (i.e. bit-32).
 * @param base The base number.
 * @param exponent The exponent.
 * @param modulus The modulus.
 * @return (base^exponent) % modulus
 */
template <bool bit32 = false>
constexpr uint64_t mod_pow(uint64_t base, uint64_t exponent, uint64_t modulus) {
  uint64_t result = 1 % modulus;
  base %= modulus;
  while (exponent > 0) {
    if (exponent & 1)
      result = mod_mul<bit32>(result, base, modulus);
    base = mod_mul<bit32>(base, base, modulus);
    exponent >>= 1;
  }
  return result;
}
} // namespace weilycoder


#line 1 "weilycoder/number-theory/primitive_root.hpp"



/**
 * @file primitive_root.hpp
 * @brief Functions to find primitive roots modulo a prime
 */

#line 1 "weilycoder/number-theory/factorize.hpp"



/**
 * @file factorize.hpp
 * @brief Functions for factorizing numbers using Pollard's Rho algorithm
 */

#line 1 "weilycoder/const_rand.hpp"



#line 5 "weilycoder/const_rand.hpp"

namespace weilycoder {
/**
 * @brief Linear Congruential Generator (LCG) to produce pseudo-random numbers
 *        at compile-time.
 * @tparam a The multiplier.
 * @tparam c The increment.
 * @tparam m The modulus.
 * @param state The current state of the generator.
 * @return The next state of the generator.
 */
template <uint32_t a, uint32_t c, uint64_t m>
constexpr uint32_t &lcg_next(uint32_t &state) {
  state = (static_cast<uint64_t>(a) * state + c) % m;
  return state;
}

/**
 * @brief LCG using parameters from "Minimal Standard" by Park and Miller.
 * @param state The current state of the generator.
 * @return The next state of the generator.
 */
constexpr uint32_t &lcg_minstd(uint32_t &state) {
  return lcg_next<48271, 0, 2147483647>(state);
}

/**
 * @brief LCG using parameters from "minstd_rand0" by Park and Miller.
 * @param state The current state of the generator.
 * @return The next state of the generator.
 */
constexpr uint32_t &lcg_minstd_rand0(uint32_t &state) {
  return lcg_next<16807, 0, 2147483647>(state);
}

/**
 * @brief LCG using parameters from "Numerical Recipes".
 * @param state The current state of the generator.
 * @return The next state of the generator.
 */
constexpr uint32_t &lcg_nr(uint32_t &state) {
  return lcg_next<1103515245, 12345, 4294967296>(state);
}
} // namespace weilycoder


#line 1 "weilycoder/number-theory/prime.hpp"



/**
 * @file prime.hpp
 * @brief Prime Number Utilities
 */

#line 11 "weilycoder/number-theory/prime.hpp"
#include <type_traits>

namespace weilycoder {
/**
 * @brief Miller-Rabin primality test for a given base.
 * @tparam bit32 If true, won't use 128-bit arithmetic. You should ensure that
 *         all inputs are small enough to avoid overflow (i.e. bit-32).
 * @tparam base The base to test.
 * @param n The number to test for primality.
 * @param d An odd component of n-1 (n-1 = d * 2^s).
 * @param s The exponent of 2 in the factorization of n-1.
 * @return true if n is probably prime for the given base, false if composite.
 */
template <bool bit32, uint64_t base>
constexpr bool miller_rabin_test(uint64_t n, uint64_t d, uint32_t s) {
  uint64_t x = mod_pow<bit32>(base, d, n);
  if (x == 0 || x == 1 || x == n - 1)
    return true;
  for (uint32_t r = 1; r < s; ++r) {
    x = mod_mul<bit32>(x, x, n);
    if (x == n - 1)
      return true;
  }
  return false;
}

/**
 * @brief Variadic template to test multiple bases in Miller-Rabin test.
 * @tparam bit32 If true, won't use 128-bit arithmetic. You should ensure that
 *         all inputs are small enough to avoid overflow (i.e. bit-32).
 * @tparam base The first base to test.
 * @tparam Rest The remaining bases to test.
 * @param n The number to test for primality.
 * @param d An odd component of n-1 (n-1 = d * 2^s).
 * @param s The exponent of 2 in the factorization of n-1.
 * @return true if n is probably prime for all given bases, false if composite.
 */
template <bool bit32, uint64_t base, uint64_t... Rest>
constexpr std::enable_if_t<(sizeof...(Rest) != 0), bool>
miller_rabin_test(uint64_t n, uint64_t d, uint32_t s) {
  return miller_rabin_test<bit32, base>(n, d, s) &&
         miller_rabin_test<bit32, Rest...>(n, d, s);
}

/**
 * @brief Miller-Rabin primality test using multiple bases.
 * @tparam bit32 If true, won't use 128-bit arithmetic. You should ensure that
 *         all inputs are small enough to avoid overflow (i.e. bit-32).
 * @tparam bases The bases to test.
 * @param n The number to test for primality.
 * @return true if n is probably prime, false if composite.
 */
template <bool bit32, uint64_t... bases> constexpr bool miller_rabin(uint64_t n) {
  if (n < 2)
    return false;
  if (n == 2 || n == 3)
    return true;
  if (n % 2 == 0)
    return false;

  uint64_t d = n - 1, s = 0;
  for (; d % 2 == 0; d /= 2)
    ++s;

  return miller_rabin_test<bit32, bases...>(n, d, s);
}

/**
 * @brief Miller-Rabin primality test optimized for 64-bit integers.
 *        Uses a fixed set of bases that guarantee correctness
 *        for 64-bit integers.
 * @param n The number to test for primality.
 * @return true if n is prime, false if not prime.
 */
constexpr bool miller_rabin64(uint64_t n) {
  return miller_rabin<false, 2, 325, 9375, 28178, 450775, 9780504, 1795265022>(n);
}

/**
 * @brief Miller-Rabin primality test optimized for 32-bit integers.
 *        Uses a fixed set of bases that guarantee correctness
 *        for 32-bit integers.
 * @param n The number to test for primality.
 * @return true if n is prime, false if not prime.
 */
constexpr bool miller_rabin32(uint32_t n) { return miller_rabin<true, 2, 7, 61>(n); }

constexpr bool is_prime(uint64_t n) {
  if (n <= UINT32_MAX)
    return miller_rabin32(static_cast<uint32_t>(n));
  return miller_rabin64(n);
}

/**
 * @brief Compile-time primality test for a given integer.
 * @tparam x The integer to test for primality.
 * @return true if x is prime, false otherwise.
 */
template <uint64_t x> constexpr bool is_prime() { return is_prime(x); }
} // namespace weilycoder


#line 12 "weilycoder/number-theory/factorize.hpp"
#include <algorithm>
#include <array>
#line 15 "weilycoder/number-theory/factorize.hpp"
#include <numeric>
#include <random>
#include <utility>
#include <vector>

namespace weilycoder {
/**
 * @brief Pollard's Rho algorithm to find a non-trivial factor of x
 * @tparam bit32 Whether to use 32-bit modular multiplication
 * @param x The number to factorize
 * @param c The constant in the polynomial x^2 + c
 * @return A non-trivial factor of x
 */
template <bool bit32 = false> constexpr uint64_t pollard_rho(uint64_t x, uint64_t c) {
  if (x % 2 == 0)
    return 2;
  c = c % (x - 1) + 1;
  uint32_t step = 0, goal = 1;
  uint64_t s = 0, t = 0;
  uint64_t value = 1;
  for (;; goal <<= 1, s = t, value = 1) {
    for (step = 1; step <= goal; ++step) {
      t = mod_mul<bit32>(t, t, x) + c;
      if (t >= x)
        t -= x;
      uint64_t diff = (s >= t ? s - t : t - s);
      value = mod_mul<bit32>(value, diff, x);
      if (step % 127 == 0) {
        uint64_t d = std::gcd(value, x);
        if (d > 1)
          return d;
      }
    }
    uint64_t d = std::gcd(value, x);
    if (d > 1)
      return d;
  }
  return x;
}

/**
 * @brief Pollard's Rho algorithm to find a non-trivial factor of x
 * @tparam bit32 Whether to use 32-bit modular multiplication
 * @param x The number to factorize
 * @return A non-trivial factor of x
 */
template <bool bit32 = false> uint64_t pollard_rho(uint64_t x) {
  if (x % 2 == 0)
    return 2;
  static std::minstd_rand rng{};
  return pollard_rho<bit32>(x, rng());
}

/**
 * @brief Factorize a number into its prime factors
 * @tparam bit32 Whether to use 32-bit modular multiplication
 * @param x The number to factorize
 * @return A vector of prime factors of x
 */
template <bool bit32 = false> std::vector<uint64_t> factorize(uint64_t x) {
  std::vector<uint64_t> factors;
  std::vector<std::pair<uint64_t, size_t>> stk;
  stk.emplace_back(x, 1);
  while (!stk.empty()) {
    auto [cur, cnt] = stk.back();
    stk.pop_back();
    if (cur == 1)
      continue;
    if (is_prime(cur)) {
      factors.resize(factors.size() + cnt, cur);
      continue;
    }
    uint64_t factor = cur;
    do
      factor = pollard_rho<bit32>(cur);
    while (factor == cur);
    size_t factor_count = 0;
    while (cur % factor == 0)
      cur /= factor, ++factor_count;
    stk.emplace_back(cur, cnt);
    stk.emplace_back(factor, factor_count * cnt);
  }
  std::sort(factors.begin(), factors.end());
  return factors;
}

/**
 * @brief Factorize a number into its prime factors with fixed-size array
 * @tparam N The size of the output array
 * @tparam bit32 Whether to use 32-bit modular multiplication
 * @param x The number to factorize
 * @return An array of prime factors of x
 */
template <size_t N = 64, bool bit32 = false>
constexpr std::array<uint64_t, N> factorize_fixed(uint64_t x) {
  uint32_t random_state = 1;
  size_t factor_idx = 0, stk_idx = 0;
  std::array<uint64_t, N> factors{};
  std::array<uint64_t, 64> stk_val{};
  std::array<size_t, 64> stk_cnt{};
  stk_val[stk_idx] = x, stk_cnt[stk_idx++] = 1;
  while (stk_idx > 0) {
    uint64_t cur = stk_val[--stk_idx];
    size_t cnt = stk_cnt[stk_idx];
    if (cur == 1)
      continue;
    if (is_prime(cur)) {
      for (size_t i = 0; i < cnt; ++i)
        factors[factor_idx++] = cur;
    } else {
      uint64_t factor = cur;
      do
        factor = pollard_rho<bit32>(cur, lcg_nr(random_state));
      while (factor == cur);
      size_t factor_count = 0;
      while (cur % factor == 0)
        cur /= factor, ++factor_count;
      stk_val[stk_idx] = cur, stk_cnt[stk_idx++] = cnt;
      stk_val[stk_idx] = factor, stk_cnt[stk_idx++] = factor_count * cnt;
    }
  }
  for (size_t i = 1; i < factor_idx; ++i) {
    uint64_t key = factors[i];
    size_t j = i;
    while (j > 0 && factors[j - 1] > key) {
      factors[j] = factors[j - 1];
      --j;
    }
    factors[j] = key;
  }
  return factors;
}
} // namespace weilycoder


#line 14 "weilycoder/number-theory/primitive_root.hpp"

namespace weilycoder {
/**
 * @brief Check if g is a primitive root modulo p
 * @tparam N The size of the factors array
 * @tparam bit32 Whether to use 32-bit modular multiplication
 * @param g The candidate primitive root
 * @param p The prime modulus
 * @param factors The prime factors of p - 1
 * @return true if g is a primitive root modulo p, false otherwise
 */
template <size_t N = 64, bool bit32 = false>
constexpr bool is_primitive_root(uint64_t g, uint64_t p,
                                 const std::array<uint64_t, N> &factors) {
  for (size_t i = 0; i < N; ++i) {
    uint64_t q = factors[i];
    if (q == 0)
      break;
    if (mod_pow<bit32>(g, (p - 1) / q, p) == 1)
      return false;
  }
  return true;
}

/**
 * @brief Check if g is a primitive root modulo p
 * @tparam bit32 Whether to use 32-bit modular multiplication
 * @param g The candidate primitive root
 * @param p The prime modulus
 * @param factors The prime factors of p - 1
 * @return true if g is a primitive root modulo p, false otherwise
 */
template <bool bit32 = false>
bool is_primitive_root(uint64_t g, uint64_t p, const std::vector<uint64_t> &factors) {
  const size_t N = factors.size();
  for (size_t i = 0; i < N; ++i) {
    uint64_t q = factors[i];
    if (q == 0)
      break;
    if (mod_pow<bit32>(g, (p - 1) / q, p) == 1)
      return false;
  }
  return true;
}

/**
 * @brief Find a primitive root modulo a prime p
 * @tparam bit32 Whether to use 32-bit modular multiplication
 * @param p The prime modulus
 * @return A primitive root modulo p, or 0 if p is not prime
 */
template <bool bit32 = false, size_t N = 64>
constexpr uint64_t prime_primitive_root(uint64_t p) {
  if (p == 2)
    return 1;
  if (!is_prime(p))
    return 0;
  auto factors = factorize_fixed<N, bit32>(p - 1);
  auto factors_set = std::array<uint64_t, N>{};
  size_t factor_count = 0;
  for (size_t i = 0; i < N; ++i) {
    uint64_t q = factors[i];
    if (q == 0)
      break;
    if (i == 0 || q != factors[i - 1])
      factors_set[factor_count++] = q;
  }
  for (uint64_t g = 2; g < p; ++g)
    if (is_primitive_root<N, bit32>(g, p, factors_set))
      return g;
  return 0;
}

/**
 * @brief Find a primitive root modulo a prime (compile-time version)
 * @tparam prime The prime modulus
 * @return A primitive root modulo prime.
 */
template <uint64_t prime> constexpr uint64_t prime_primitive_root() {
  if constexpr (prime == 2)
    return 1;
  if (prime < UINT32_MAX)
    return prime_primitive_root<true, 32>(prime);
  return prime_primitive_root<false, 64>(prime);
}
} // namespace weilycoder


#line 1 "weilycoder/poly/fft_utility.hpp"



#include <complex>
#include <cstddef>
#line 7 "weilycoder/poly/fft_utility.hpp"

/**
 * @file fft_utility.hpp
 * @brief Utility functions and constants for Fast Fourier Transform (FFT)
 */

namespace weilycoder {
/**
 * @brief Alias for the commonly used complex number type with double precision.
 */
using complex_t = std::complex<double>;

/**
 * @brief Compile-time constant for π (pi) parameterized by numeric type.
 * @tparam T Numeric type (e.g., float, double, long double).
 */
template <typename T> constexpr T PI = T(3.1415926535897932384626433832795l);

/**
 * @brief Perform in-place bit-reversal permutation (index reordering) required by
 *        iterative FFT.
 * @tparam T Element type stored in the vector. Must be Swappable and efficiently
 *         copyable.
 * @param a Vector to be permuted in-place. Its size should typically be a power
 *          of two for use with the FFT implementation in this file.
 */
template <typename T> void fft_change(std::vector<T> &a) {
  size_t n = a.size();
  std::vector<size_t> rev(n);
  for (size_t i = 0; i < n; ++i) {
    rev[i] = rev[i >> 1] >> 1;
    if (i & 1)
      rev[i] |= n >> 1;
    if (i < rev[i])
      std::swap(a[i], a[rev[i]]);
  }
}
} // namespace weilycoder


#line 9 "weilycoder/poly/ntt.hpp"

namespace weilycoder {
/**
 * @brief Number Theoretic Transform (NTT)
 * @tparam mod The prime modulus
 * @tparam inverse Whether to perform the inverse NTT
 * @tparam bit32 Whether to use 32-bit modular multiplication
 * @tparam root A primitive root modulo mod
 * @param y The input/output vector to be transformed
 */
template <uint64_t mod, bool inverse = false, bool bit32 = false,
          uint64_t root = prime_primitive_root<mod>()>
void ntt(std::vector<uint64_t> &y) {
  static_assert(is_prime(mod), "mod must be a prime");
  fft_change(y);
  size_t len = y.size();
  if (len == 0 || (len & (len - 1)) != 0)
    throw std::invalid_argument("Length of input vector must be a power of two");
  if ((mod - 1) % len != 0)
    throw std::invalid_argument(
        "mod - 1 must be divisible by the length of input vector");
  constexpr uint64_t g = inverse ? mod_pow<bit32>(root, mod - 2, mod) : root;
  for (size_t h = 2; h <= len; h <<= 1) {
    uint64_t wn = mod_pow<bit32>(g, (mod - 1) / h, mod);
    for (size_t j = 0; j < len; j += h) {
      uint64_t w = 1;
      for (size_t k = j; k < j + (h >> 1); ++k) {
        uint64_t u = y[k];
        uint64_t t = mod_mul<bit32>(w, y[k + (h >> 1)], mod);
        y[k] = mod_add<bit32>(u, t, mod);
        y[k + (h >> 1)] = mod_sub<bit32>(u, t, mod);
        w = mod_mul<bit32>(w, wn, mod);
      }
    }
  }
  if constexpr (inverse) {
    uint64_t inv_len = mod_pow<bit32>(len, mod - 2, mod);
    for (size_t i = 0; i < len; ++i)
      y[i] = mod_mul<bit32>(y[i], inv_len, mod);
  }
}

/**
 * @brief Number Theoretic Transform (NTT) using 32-bit modular multiplication
 * @tparam mod The prime modulus
 * @tparam inverse Whether to perform the inverse NTT
 * @tparam root A primitive root modulo mod
 * @param y The input/output vector to be transformed
 */
template <uint64_t mod, bool inverse = false, uint64_t root = prime_primitive_root(mod)>
void ntt_32(std::vector<uint64_t> &y) {
  ntt<mod, inverse, true, root>(y);
}
} // namespace weilycoder

Number Theoretic Transform (NTT) (weilycoder/poly/ntt.hpp)

Depends on

Required by

Verified with

Code

Number Theoretic Transform (NTT)
(weilycoder/poly/ntt.hpp)