Cache Conscious programming.

Current processors have systems of cache that sits between the processors and memory. A Cache is a small high-speed memory device that operates on the assumption that recently accessed data is likely to be reference again in near future. Main memory which is RAM is contacted only when the data requested is not in the cache line. This process is called as cache-miss.

A Cache is not accessible to program and complete under the control of hardware. Although programs can be made better to be cache aware by improving spatial and temporal locality through regular predictable memory access pattern.

A Cache is a SRAM which is physically placed between processor and main memory. The goal of this memory is to keep the working set of data ( recently accessed data ) close to CPU. Working of cache is mainly depends upon the locality of reference. So the usefulness of this caches is mainly depends upon how your program access the cache and data. Temporal and Spatial are the two main components of the principle of locality.

Cache is divided into Layers. The first layer (L1) is located near the CPU core. It works as fast as CPU and has access latency of 1 to 3 CPU cycles, but the size of this cache is very limited not bigger than few KBs. The second layer ( L2 ) has little more space ( approx 512 MB to 4 MB ) but has access latency of around 10 CPU cycles. A third level of cache ( L3 ) can have capacity of 10’s of Megabytes but requires 10s of magnitude of CPU cycles, But all these 3 forms of Caches are still faster than Main memory.

Cache-Lines are the equal size blocks which divided your caches often between size of 64 and 256 bytes long. Cache-line size may vary between the CPU cache levels. Cache-lookup is the process which determine whether a cache-miss or cache-hit occurs. Cache-hit occurs when the requested data is found in one of the cache-lines. Cache miss occurs when the requested data is not found in the cache-lines and need to access further level of hierarchy.

To make your program more cache friendly, you need to improve on how your program is using the memory and improve its locality of access. Try to reduce the random access to the memory by using more predictable and regular memory access pattern. Try reducing or eliminating the user of pointers to exploit cache by using more regular and predictable memory access patterns. Pointers can cause two issues, first is the memory location they are pointing to is not known till they are reference. Second is they can cause you to access the Random memory ( SRAM ) which will result into poor usage of cache. It is yet to be proved that complete elimination of the pointers, but one can have techniques which will help to reduce the usage of pointers in such a way that resulting data structure is cache conscious.

Here is an example of cache conscious Binary search tree. ( see here for more details )

#include <malloc.h>
#include <string.h>

typedef unsigned long long  uint64_t;
typedef signed long int     int32_t;

#define PTR_SIZE 8
#define PTR_LEFT_RIGHT_SIZE 16

class cache_bst {
public:
    cache_bst();
    ~cache_bst();
    bool insert ( const char *);
    bool search ( const char *);
private:
    char  *m_buffer;
    size_t m_buffer_size;
    size_t m_offset;
    size_t _resize(size_t);
    static const int PAGE_SIZE = 10000000;
protected:
};
cache_bst::cache_bst() {
    m_buffer          = (char* )0;
    m_buffer_size     = 0;
    m_offset          = 0;
}
cache_bst::~cache_bst() {
    free(m_buffer);
}
size_t cache_bst::_resize(size_t length) {
    if ( m_offset == 0 ) {
        m_buffer_size = PAGE_SIZE;
        m_buffer    = (char*)malloc (m_buffer_size);
    } else {
        if(m_offset + length + PTR_LEFT_RIGHT_SIZE >= m_buffer_size) {
            char *tmp = (char*)malloc(m_buffer_size + PAGE_SIZE); 
            memcpy(tmp, m_buffer, m_buffer_size); 
            m_buffer_size += PAGE_SIZE;
            free(m_buffer);
            m_buffer = tmp;
        }
    }
    size_t rc = m_offset;
    m_offset += length + PTR_LEFT_RIGHT_SIZE;
    return rc;
}
bool cache_bst::insert ( const char *word) {
    char *root = m_buffer; 
    size_t offset = 0;
    char *key;
    if(m_buffer == (char *)0) {
        size_t len = strlen(word);
        offset     = _resize(len + 1 );
        key        = (char *)(m_buffer + PTR_LEFT_RIGHT_SIZE); 
        *(uint64_t *)(m_buffer) = *(uint64_t *)(m_buffer + PTR_SIZE) = 0;
        for(; *word != '\0'; word++, key++) {
            *key = *word;
        }
        *key = '\0';

        return true;
    } 
    while(1) {
        uint64_t *left = (uint64_t *)(root);
        uint64_t *right= (uint64_t *)(root + PTR_SIZE);
        char *key = (char *)(root + PTR_LEFT_RIGHT_SIZE);
        size_t rc = strcmp(word, key);
        if(rc == 0 ) {
            return false;
        } else if (rc < 0) {
            if( *left == 0) {
                rc    = _resize(strlen(word) + 1);
                left  = (uint64_t *)(m_buffer + offset);
                *left = rc;
                root  = (char *)(m_buffer + *left);
                left  = (uint64_t *)(root);
                right = (uint64_t *)(root + PTR_SIZE);
                key   = (char *)(root + PTR_LEFT_RIGHT_SIZE);
                *left = *right = 0;
                for(; *word != '\0'; word++, key++) {
                    *key = *word;
                }
                *key='\0';
                return true;
            }
            offset   = (size_t)*left;
            root     = (char *)(m_buffer + *left);
        } else {
            if( *right == 0) {
                size_t length = strlen(word);
                rc     =  _resize(length + 1);
                right  = (uint64_t *)(m_buffer + offset + PTR_SIZE);
                *right = rc;
                root   = (char *)(m_buffer + *right);
                left   = (uint64_t *)(root);
                right  = (uint64_t *)(root + PTR_SIZE);
                key    = (char *)(root + PTR_LEFT_RIGHT_SIZE);
                *left  = *right = 0; 
                for(; *word != '\0'; word++, key++){
                    *key=*word;
                }
                *key='\0';
                return true;
            }
            offset = (size_t)*right;
            root   = (char *)(m_buffer + *right);
        }
    }
    return true;
}
bool cache_bst::search( const char *word) {
    char *root = m_buffer;
    
    if(root==NULL) {
        return false;
    }
    while (1) {
        char * key = (root + PTR_LEFT_RIGHT_SIZE);  
        int32_t rc = strcmp(word, key);   

        uint64_t *left  = (uint64_t *)(root);  
        uint64_t *right = (uint64_t *)(root + PTR_SIZE);

        if ( rc == 0 ) {
            return true;
        } else if(rc  < 0) {
            if(*left == 0)  
                return false;
            root = (m_buffer + *left);
        } else {
            if(*right == 0)  
                return false;
            root = (m_buffer + *right);
        }
    } /* end of while (1)*/
    return false;
}

int main(int argc, char **argv) {
    cache_bst dict;
    dict.insert("A");
    dict.insert("B");
    dict.insert("C");
    dict.insert("D");
    dict.insert("E");
    dict.insert("F");
    dict.insert("G");
    dict.insert("H");

    bool found = dict.search("X");

    return 0;
}
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s