I'm a scientific C programmer moving my way over to using Modern C++. I found myself needing a HyperLogLog implementation, and I wanted to use this for practice.
I plan to move a number of these functions over into an accompanying header file, but I do find it convenient at the moment to only need to include it (esp. in unit tests), and it was easier to post this to CodeReview as one chunk anyhow.
I know I'm not quite writing idiomatic C++, and that's why I've posted this here. I want to improve my code to be more modern, as well as just good code.
It does seem to work -- I have unit tests for random integers, all integers in given ranges (e.g., 1-100000), and real world examples, covering addition and set difference operations.
However, I'm particularly unsure about my copy and move constructors and my assignment operators. I have C++17 at my disposal, but I am currently only using features though C++14.
#ifndef _HLL_H_
#define _HLL_H_
#include <cstdlib>
#include <cstdio>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <cinttypes>
#include <algorithm>
#include <vector>
#include "logutil.h" // for LOG_DEBUG
namespace mystuff {
constexpr double make_alpha(size_t m) {
switch(m) {
case 16: return .673;
case 32: return .697;
case 64: return .709;
default: return 0.7213 / (1 + 1.079/m);
}
}
class hll_t {
// HyperLogLog implementation.
// To make it general, the actual point of entry is a 64-bit integer hash function.
// Therefore, you have to perform a hash function to convert various types into a suitable query.
// We could also cut our memory requirements by switching to only using 6 bits per element,
// (up to 64 leading zeros), though the gains would be relatively small
// given how memory-efficient this structure is.
// Attributes
size_t np_;
size_t m_;
double alpha_;
double relative_error_;
std::vector<uint8_t> core_;
double sum_;
int is_calculated_;
public:
// Constructor
hll_t(size_t np=20): np_(np), m_(1 << np), alpha_(make_alpha(m_)),
relative_error_(1.03896 / std::sqrt(m_)),
core_(m_, 0),
sum_(0.), is_calculated_(0) {}
// Call sum to recalculate if you have changed contents.
void sum() {
sum_ = 0;
for(unsigned i(0); i < m_; ++i) sum_ += 1. / (1 << core_[i]);
is_calculated_ = 1;
}
// Returns cardinality estimate. Sums if not calculated yet.
double report() {
if(!is_calculated_) sum();
const double ret(alpha_ * m_ * m_ / sum_);
// Correct for small values
if(ret < m_ * 2.5) {
int t(0);
for(unsigned i(0); i < m_; ++i) t += (core_[i] == 0);
if(t) return m_ * std::log((double)(m_) / t);
}
return ret;
// We don't correct for too large just yet, but we should soon.
}
// Returns the size of a symmetric set difference.
double operator^(hll_t &other) {
hll_t tmp(*this);
tmp += other;
tmp.sum();
return report() + other.report() - tmp.report();
}
// Returns error estimate
double est_err() {
return relative_error_ * report();
}
void add(uint64_t hashval) {
const uint32_t index = hashval >> (64 - np_);
const uint32_t lzt(__builtin_clzll(hashval << np_) + 1);
if(core_[index] < lzt) core_[index] = lzt;
}
std::string to_string() {
return std::to_string(report()) + ", +- " + std::to_string(est_err());
}
// Reset.
void clear() {
std::fill(core_.begin(), core_.end(), 0u);
sum_ = is_calculated_ = 0;
}
// Assignment Operators
hll_t &operator=(hll_t &other) {
m_ = other.m_;
np_ = other.np_;
core_ = std::move(other.core_);
alpha_ = other.alpha_;
sum_ = other.sum_;
relative_error_ = other.relative_error_;
m_ = other.m_;
return *this;
}
hll_t &operator=(const hll_t &other) {
m_ = other.m_;
np_ = other.np_;
core_ = other.core_;
alpha_ = other.alpha_;
sum_ = other.sum_;
relative_error_ = other.relative_error_;
m_ = other.m_;
return *this;
}
hll_t(const hll_t &other): hll_t(other.m_) {
*this = other;
}
hll_t(hll_t &&other):
np_(other.np_),
m_(other.m_),
alpha_(other.alpha_),
relative_error_(other.relative_error_),
core_(std::move(other.core_)),
sum_(other.sum_),
is_calculated_(other.is_calculated_) {
}
hll_t const &operator+=(const hll_t &other) {
if(other.np_ != np_)
LOG_EXIT("np_ (%zu) != other.np_ (%zu)\n", np_, other.np_);
// If we ever find this to be expensive, this could be trivially implemented with SIMD.
for(unsigned i(0); i < m_; ++i) core_[i] |= other.core_[i];
return *this;
}
hll_t operator+(const hll_t &other) const {
if(other.np_ != np_)
LOG_EXIT("np_ (%zu) != other.np_ (%zu)\n", np_, other.np_);
hll_t ret(*this);
return ret += other;
}
// Clears, allows reuse with different np.
void resize(size_t new_size) {
new_size = roundup64(new_size);
LOG_DEBUG("Resizing to %zu, with np = %zu\n", new_size, (size_t)std::log2(new_size));
clear();
core_.resize(new_size);
}
// Getter for is_calculated_
bool is_ready() {
return is_calculated_;
}
};
} // namespace mystuff
Based on the feedback in a comment, here is my second attempt at the move and copy assignment and constructors:
// Assignment Operators
hll_t &operator=(const hll_t &other) {
m_ = other.m_;
np_ = other.np_;
core_ = other.core_;
alpha_ = other.alpha_;
sum_ = other.sum_;
relative_error_ = other.relative_error_;
m_ = other.m_;
return *this;
}
hll_t &operator=(hll_t &&other) {
np_ = other.np_;
m_ = other.m_;
alpha_ = other.alpha_;
relative_error_ = other.relative_error_;
core_ = other.core_;
is_calculated_ = other.is_calculated_;
sum_ = other.sum_;
return *this;
}
hll_t(const hll_t &other): hll_t(other.m_) {
*this = other;
}
hll_t(hll_t &&other):
np_(other.np_),
m_(other.m_),
alpha_(other.alpha_),
relative_error_(other.relative_error_),
core_(std::move(other.core_)),
sum_(other.sum_),
is_calculated_(other.is_calculated_) {
}
hll_t &operator=(hll_t &other)
usually is a copy assignment operator which you implemented further below with theconst
argument. The move assignment operator should behll_t &operator=(hll_t &&other)
. \$\endgroup\$