Sign up ×
Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.

I made a simple hash table and I was wondering if there was any way to increase the efficiency of search times. My table class is called Dictionary, because C# has tainted me.

The way I set this up is that Dictionary has an array of Buckets, a modified linked list whose Nodes store string keys as well as whatever other data.

I use a modular hash function to get the index of the array the Dictionary should access in, then search/insert with the Bucket at that index.

Dictionary class:

#ifndef _DICTIONARY_H_
#define _DICTIONARY_H_

#include <iostream>
#include <string>

#include "bucket.h"

using namespace std;

template <class T>
class Dictionary
{
private:

    Bucket<T>* mBuckets;

    // Number of buckets
    int mCount;

    int hash(string key)
    {
        // Modular hashing algorithm

        int value = 0;

        for (int i = 0; i < key.length(); i++)
            value += key[i];

        return value % mCount;
    }

public:
    Dictionary();
    Dictionary(int mCount);
    ~Dictionary();

    bool containsKey(string key);
    void display();
    int getCount();
    T getData(string key);
    bool insert(string key, T value);
    bool isEmpty();
};

// ********************************************************
//                Constructor / Destructor
// ********************************************************

template <class T>
Dictionary<T>::Dictionary()
{
    // Default mCount to 16
    this->mCount = 16;

    // Define mBins
    mBuckets = new Bucket<Item<T>>[mCount];
}

template <class T>
Dictionary<T>::Dictionary(int mCount)
{
    // Default mCount to 16 if invalid
    this->mCount = (mCount >= 0 ? mCount : 16);

    // Define mBins
    mBuckets = new Bucket<T>[mCount];
}

template <class T>
Dictionary<T>::~Dictionary()
{
    delete[] mBuckets;
}


// ********************************************************
//                       Functions
// ********************************************************

template <class T>
bool Dictionary<T>::containsKey(string key)
{
    int bin = hash(key);

    return mBuckets[bin].isExist(key);
}

template <class T>
void Dictionary<T>::display()
{
    cout
        << " Dictionary - Items:" << getCount() << "\n"
        << "*********************** \n";

    for (int i = 0; i < mCount; i++)
    {
        cout << left << i << ": ";

        mBuckets[i].display();

        cout << "\n\n";
    }
}

template <class T>
void Dictionary<T>::displayDistribution()
{
    cout
        << " Dictionary - Distribution:" << getCount() << "\n"
        << "*********************** \n";

    for (int i = 0; i < mCount; i++)
    {
        cout 
            << left 
            << i << ": " << mBuckets[i].getCount();

        cout << "\n";
    }
}

template <class T>
int Dictionary<T>::getCount()
{
    int count = 0;
    for (int i = 0; i < mCount; i++)
    {
        count += mBuckets[i].getCount();
    }

    return count;
}

template <class T>
T Dictionary<T>::getData(string key)
{
    int bin = hash(key);

    return mBuckets[bin].getData(key);
}

template <class T>
bool Dictionary<T>::insert(string key, T value)
{
    int bin = hash(key);

    mBuckets[bin].insert(key, value);

    return true;
}

template <class T>
bool Dictionary<T>::isEmpty()
{
    return getCount() == 0;
}

#endif

Bucket class:

#ifndef _BUCKET_H_
#define _BUCKET_H_

#include <iostream>

using namespace std;

template <class T>
class Bucket
{
private:
    template <class T>
    struct Node
    {
        string mKey;
        T mData;
        Node<T> *mNext, *mPrevious;

        Node()
        {
            mKey = "";
            mData = T();
            mNext = NULL;
            mPrevious = NULL;
        }

        Node(string key, T data)
        {
            mKey = key;
            mData = data;
            mNext = NULL;
            mPrevious = NULL;
        }
    };

    Node<T> *mHead, *mTail;
    int mCount;

public:
    Bucket();
    ~Bucket();


    int getCount();
    bool isEmpty();
    bool isExist(string searchKey);
    bool remove(string searchKey);

    void display();
    void insert(string key, T data);

    T getData(string key);
};

// ********************************************************
//                Constructor / Destructor
// ********************************************************

template <class T>
Bucket<T>::Bucket()
{
    mHead = NULL;
    mTail = NULL;
    mCount = 0;
}

template <class T>
Bucket<T>::~Bucket()
{
    Node<T> *tmp, *toBeDeleted;

    tmp = mHead;

    // removing node by node
    while (tmp != NULL)
    {
        toBeDeleted = tmp;
        tmp = tmp->mNext;
        toBeDeleted->mNext = NULL;

        delete toBeDeleted;
    }

    // reinitialize the pointers
    mHead = NULL;
    mTail = NULL;
    mCount = 0;
}


// ********************************************************
//                       Functions
// ********************************************************

template <class T>
int Bucket<T>::getCount()
{
    return mCount;
}


template <class T>
bool Bucket<T>::isEmpty()
{
    return mCount == 0;
}


template <class T>
bool Bucket<T>::isExist(string searchKey)
{
    Node<T> *tmp = mHead;

    while (tmp != NULL)
    {
        if (tmp->mKey == searchKey)
            return true;

        tmp = tmp->mNext;
    }

    return false;
}


template <class T>
bool Bucket<T>::remove(string searchKey)
{
    Node<T> *tmp, *prev;

    if (mHead == NULL)
        return false;
    else if (searchKey < mHead->mKey || searchKey > mTail->mKey)
        return false;

    tmp = mHead;
    prev = NULL;

    for (int i = 0; i < mCount; i++)
    {
        if (searchKey == tmp->mKey)
            break;

        prev = tmp;
        tmp = tmp->mNext;
    }

    if (tmp != NULL)
    {
        if (tmp == mHead)
        {
            tmp = mHead;

            mHead = mHead->mNext;
            if (mHead == NULL)
                mTail = NULL;

            tmp->mNext = NULL;
        }
        else if (tmp == mTail)
        {
            prev->mNext = NULL;
            mTail = prev;
        }
        else
        {
            prev->mNext = tmp->mNext;
            tmp->mNext = NULL;
        }

        delete tmp;
        mCount--;

        return true;
    }

    return false;
}


template <class T>
void Bucket<T>::display()
{
    Node<T> *tmp;

    if (mHead == NULL)
    {
        cout << "{ }\n";
        return;
    }

    cout << "{ ";
    tmp = mHead;
    while (tmp != NULL)
    {
        cout
            << "["
            << tmp->mKey
            << ", "
            << tmp->mData
            << "]"
            << (tmp != mTail ? ", " : " }");

        tmp = tmp->mNext;
    }
    cout << "\n";
}


template <class T>
void Bucket<T>::insert(string key, T data)
{
    Node<T> *tmp, *oneBefore, *newNode;

    newNode = new Node<T>(key, data);
    if (newNode == NULL)
        return;

    if (mHead == NULL)
    {
        mHead = newNode;
        mTail = newNode;
    }
    else
    {
        if (key < mHead->mKey) // Put at head
        {
            newNode->mNext = mHead;

            newNode->mPrevious = NULL;

            mHead = newNode;
        }
        else if (key > mTail->mKey) // Put at tail
        {
            mTail->mNext = newNode;

            newNode->mPrevious = mTail;

            mTail = newNode;
        }
        else if (key == mHead->mKey || key == mTail->mKey) // Dont insert if already added
        {
            delete newNode;
            return;
        }
        else
        {
            tmp = mHead;
            oneBefore = mHead;

            // Iterate through list to find position
            while (tmp->mKey < key)
            {
                oneBefore = tmp;

                tmp = tmp->mNext;
            }

            if (tmp->mKey != key)
            {
                newNode->mNext = tmp;
                tmp->mPrevious = newNode;

                oneBefore->mNext = newNode;
                newNode->mPrevious = oneBefore;
            }
            else
            {
                delete newNode;
                return;
            }
        }
    }

    mCount++;
}


template <class T>
T Bucket<T>::getData(string key)
{
    Node<T> *tmp = mHead;

    while (tmp != NULL)
    {
        if (tmp->mKey == key)
            return tmp->mData;

        tmp = tmp->mNext;
    }

    return T();
}

#endif

I've been testing the run times of my search function by loading pairs from a file with 398484 unique entries, all keys in alphabetical order. Here's a link to the file.

I have a few other concerns, mainly, IS this a correct hash table implementation? Sounds weird, but I've never made a hash table before, so what I know about them is derived from many a Stack Overflow and Google link.

The way it's currently implemented, my buckets can be however large they want, and the number of items per bucket averages out at around (numItems/numBuckets).

The results of searching for keys at the beginning, middle, and end of the file:

Search Key: a
Search key found.

 Search time: 2.82251898e-05 seconds.

Key: a

Value: 1411153


Search Key: mesophyls
Search key found.

 Search time: 0.001279370876 seconds.

Key: mesophyls

Value: 1418758


Search Key: zyzzyvas
Search key found.

 Search time: 0.001327610291 seconds.

Key: zyzzyvas

Value: 2223394
share|improve this question

2 Answers 2

First and foremost

there is std::unordered_map. If you just need a hash map, use it.

What's wrong

  1. Your hash is an int, if your string consists of negative chars or overflows, you will get a negative value at the end. In C/C++ the modulo operator (%) returns a negative value for a negative input.

  2. How does that compile? What's Item?

    mBuckets = new Bucket<Item<T>>[mCount];

Performance

  1. Your code doesn't support rebalance. That is quite essential if you want to provide generic performance. See std::unordered_map::rehash.

  2. Profile your code to find out where time is spent.

  3. Avoid unnecessary copies when taking generic arguments as parameters and returning them. In general take by const& and return (const)&. This applies both for the std::string parameters as well as for the generic type T. If this is really heavyly used library code, consider using perfect forwarding.

Points for improvement

  1. Do not use using std. Especially not in library code.
  2. You don't provide const interface. You cant even ask isEmpty on a const Dictionary
  3. Your hash function is bad: Just adding the values does not work well. "foo" has the same hash as "oof".
  4. Implement the standard operators following the expected semantic e.g. operator[] for element access.
  5. Use nullptr instead of NULL.
  6. Use std::unique_ptr<> for owning pointers instead of managing memory manually.
  7. Use initializer lists instead of initializing members in constructors.
  8. Don't implement all aspects of a constructor twice, rely on one from the other if possible
  9. Simply returning when your new returns NULL is a VERY BAD IDEA.
  10. Provide begin()/end() for iterators.
  11. Don't implement output within your template library code.
  12. Why use double linked Nodes? The whole point of the hashtable is that there should never be an excess number of Nodes in one Bucket anyway. The Bucket<T>::insert is incredibly difficult to read.
  13. Use algorithms instead of raw loops.
  14. Your manual new / delete will probably start leak as soon as exceptions happen.

Due to the amount of points I cannot go into details with each and everyone. Most of the points are standard items that should be easily searchable. Please ask if you have a specific question.

share|improve this answer
    
Thanks for looking over what I have. I'll make edits based on what you noted. Also I do know about unordered_map, I like making stuff like this for the learning aspect haha. – Gurman8r yesterday
    
Good start, you got most. Added some points too. – Deduplicator 23 hours ago
  1. Consider following the conventions of the standard library for member-names, so you can use generic algorithms with your class:
    size for getCount, empty for isEmpty, contains for containsKey, at/[] for getData.

  2. As Node is in Bucket, there's no sense in templating that too unless you need additional template-parameter (you don't).

  3. Consider killing the constructors of Node to make it an aggregate and then use aggregate-initialization.

  4. Looks like display and displayDistribution are pure debugging-helpers.
    Otherwise, you should make them more generically useful, by adjusting their output, and getting the output-stream as an argument (optionally defaulted to std::cout if not provided).
    If one of them is good enough, set it as the free (if neccessary friend) function template <class T> std::ostream& operator<<(std::ostream&, const Dictionary<T>&).

  5. As in 4 also for Bucket.

  6. Bucket seems to be an implementation-detail of Dictionary.
    If that's right, handle it such and keep it out of the public eye, probably by making it a member.

share|improve this answer

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Not the answer you're looking for? Browse other questions tagged or ask your own question.