21 x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
22 x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
23 x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
24 x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
25 return (((x >> 16) | (x << 16))) >> (32 - bit_length);
30 return x && !(x & (x - 1));
37 const Fr& generator_start,
38 const Fr& generator_shift,
39 const size_t generator_size)
42 "generator_size must be divisible by num_threads to avoid silently skipping elements");
44 Fr thread_shift = generator_shift.pow(static_cast<uint64_t>(j * (generator_size / domain.num_threads)));
45 Fr work_generator = generator_start * thread_shift;
46 const size_t offset = j * (generator_size / domain.num_threads);
47 const size_t end = offset + (generator_size / domain.num_threads);
48 for (size_t i = offset; i < end; ++i) {
49 target[i] = coeffs[i] * work_generator;
50 work_generator *= generator_shift;
56 requires SupportsFFT<Fr>
60 BB_ASSERT(coeffs != target,
"fft_inner_parallel does not support in-place operation");
64 for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
65 uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
66 uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
67 __builtin_prefetch(&coeffs[next_index_1]);
68 __builtin_prefetch(&coeffs[next_index_2]);
70 uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
71 uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);
73 Fr::__copy(coeffs[swap_index_1], temp_1);
74 Fr::__copy(coeffs[swap_index_2], temp_2);
75 target[i + 1] = temp_1 - temp_2;
76 target[i] = temp_1 + temp_2;
81 for (
size_t m = 2; m < (domain.size); m <<= 1) {
92 const size_t start = j * (domain.thread_size >> 1);
93 const size_t end = (j + 1) * (domain.thread_size >> 1);
113 const size_t block_mask = m - 1;
123 const size_t index_mask = ~block_mask;
128 const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];
133 for (size_t i = start; i < end; ++i) {
134 size_t k1 = (i & index_mask) << 1;
135 size_t j1 = i & block_mask;
136 temp = round_roots[j1] * target[k1 + j1 + m];
137 target[k1 + j1 + m] = target[k1 + j1] - temp;
138 target[k1 + j1] += temp;
144template <
typename Fr>
145 requires SupportsFFT<Fr>
151 const size_t start = j * domain.thread_size;
152 const size_t end = (j + 1) * domain.thread_size;
153 for (size_t i = start; i < end; ++i) {
154 target[i] *= domain.domain_inverse;
162 std::vector<Fr> evaluations(num_threads,
Fr::zero());
166 auto range = chunk.
range(n);
170 size_t start = *range.begin();
171 Fr z_acc = z.
pow(
static_cast<uint64_t
>(start));
172 for (
size_t i : range) {
173 Fr work_var = z_acc * coeffs[i];
180 for (
const auto& eval : evaluations) {
190 for (
size_t i = 0; i < n; ++i) {
212 for (
size_t i = 1; i < n; ++i) {
213 const Fr r = roots[i];
214 dest[i + 1] = dest[i];
215 for (
size_t k = i; k >= 1; --k) {
216 dest[k] = dest[k - 1] - r * dest[k];
218 dest[0] = -r * dest[0];
225 for (
size_t i = 0; i < n; ++i) {
226 result *= (z - roots[i]);
231template <
typename Fr>
271 for (
size_t i = 0; i < n; ++i) {
272 for (
size_t j = i + 1; j < n; ++j) {
274 "compute_efficient_interpolation requires distinct evaluation points");
278 std::vector<Fr> numerator_polynomial(n + 1);
279 polynomial_arithmetic::compute_linear_polynomial_product(evaluation_points, numerator_polynomial.
data(), n);
281 std::vector<Fr> roots_and_denominators(2 * n);
282 std::vector<Fr> temp_src(n);
283 for (
size_t i = 0; i < n; ++i) {
284 roots_and_denominators[i] = -evaluation_points[i];
285 temp_src[i] = src[i];
288 roots_and_denominators[n + i] = 1;
289 for (
size_t j = 0; j < n; ++j) {
293 roots_and_denominators[n + i] *= (evaluation_points[i] - evaluation_points[j]);
301 std::vector<Fr> temp_dest(n);
303 bool interpolation_domain_contains_zero =
false;
306 if (numerator_polynomial[0] ==
Fr(0)) {
307 for (
size_t i = 0; i < n; ++i) {
308 if (evaluation_points[i] ==
Fr(0)) {
310 interpolation_domain_contains_zero =
true;
316 if (!interpolation_domain_contains_zero) {
317 for (
size_t i = 0; i < n; ++i) {
319 z = roots_and_denominators[i];
321 multiplier = temp_src[i] * roots_and_denominators[n + i];
322 temp_dest[0] = multiplier * numerator_polynomial[0];
324 dest[0] += temp_dest[0];
325 for (
size_t j = 1; j < n; ++j) {
326 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
328 dest[j] += temp_dest[j];
332 for (
size_t i = 0; i < n; ++i) {
338 z = roots_and_denominators[i];
340 multiplier = temp_src[i] * roots_and_denominators[n + i];
342 temp_dest[1] = multiplier * numerator_polynomial[1];
346 dest[1] += temp_dest[1];
348 for (
size_t j = 2; j < n; ++j) {
349 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
351 dest[j] += temp_dest[j];
355 for (
size_t i = 0; i < n; ++i) {
356 dest[i] += temp_src[idx_zero] * roots_and_denominators[n + idx_zero] * numerator_polynomial[i + 1];
361template fr evaluate<fr>(
const fr*,
const fr&,
const size_t);
364template fr compute_sum<fr>(
const fr*,
const size_t);
365template void compute_linear_polynomial_product<fr>(
const fr*,
fr*,
const size_t);
366template void compute_efficient_interpolation<fr>(
const fr*,
fr*,
const fr*,
const size_t);
371template void compute_efficient_interpolation<grumpkin::fr>(
const grumpkin::fr*,
#define BB_ASSERT(expression,...)
#define BB_ASSERT_DEBUG(expression,...)
#define BB_ASSERT_EQ(actual, expected,...)
const std::vector< FF * > & get_inverse_round_roots() const
Fr compute_linear_polynomial_product_evaluation(const Fr *roots, const Fr z, const size_t n)
uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
void ifft(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain)
void compute_linear_polynomial_product(const Fr *roots, Fr *dest, const size_t n)
Fr evaluate(const Fr *coeffs, const Fr &z, const size_t n)
bool is_power_of_two(uint64_t x)
void fft_inner_parallel(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain, const Fr &, const std::vector< Fr * > &root_table)
void compute_efficient_interpolation(const Fr *src, Fr *dest, const Fr *evaluation_points, const size_t n)
void scale_by_generator(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain, const Fr &generator_start, const Fr &generator_shift, const size_t generator_size)
Fr compute_sum(const Fr *src, const size_t n)
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
auto range(size_t size, size_t offset=0) const
BB_INLINE constexpr field pow(const uint256_t &exponent) const noexcept
static void batch_invert(C &coeffs) noexcept
Batch invert a collection of field elements using Montgomery's trick.
static constexpr field zero()