Spelling Corrector.

Recently I was reading a blog . Gives excellent theory and working smallest code in python behind google Did you mean concept. Since I was doing more study on Bk-Tree, which also have potential to have spelling corrector. Here is my implementation for the spelling corrector. You can download a dictionary file from .

#include <stdio.h>
#include <string>
#include <vector>
#include <queue>
#include <fstream> 
#include <iostream> 
#include <sstream>
#include <iterator>
#include <algorithm>
#include <set>
#include "Timer.h"

class BkTree {
public:
	BkTree();
	~BkTree();
	void insert(std::string m_item);
	void get_nearest_words(std::string center, std::vector<std::string>& friends);
private:
	size_t EditDistance( const std::string &s, const std::string &t );
	struct Node {
		std::string m_item;
		size_t m_distToParent;
	Node *m_firstChild;
		Node *m_nextSibling;
		Node(std::string x, size_t dist);         
		~Node();
	};
	Node *m_root;
	int   m_size;
protected:
};

BkTree::BkTree() {
	m_root = NULL; 
	m_size = 0;
}

BkTree::~BkTree() { 
	if( m_root ) 
		delete m_root; 
}
BkTree::Node::Node(std::string x, size_t dist) {
	m_item         = x;
	m_distToParent = dist;
	m_firstChild   = m_nextSibling = NULL;
}

BkTree::Node::~Node() {
	if( m_firstChild ) 
		delete m_firstChild;
	if( m_nextSibling ) 
		delete m_nextSibling;
}
void BkTree::insert(std::string m_item) {
	if( !m_root ){
		m_size = 1;
		m_root = new Node(m_item, -1);
		return;
	}
	Node *t = m_root;
	while( true ) {
		size_t d = EditDistance( t->m_item, m_item );
		if( !d ) 
			return;
		Node *ch = t->m_firstChild;
		while( ch ) {
			if( ch->m_distToParent == d ) { 
				t = ch; 
				break; 
			}
			ch = ch->m_nextSibling;
		}
		if( !ch ) {
			Node *newChild = new Node(m_item, d);
			newChild->m_nextSibling = t->m_firstChild;
			t->m_firstChild = newChild;
			m_size++;
			break;
		}
	}
}
size_t BkTree::EditDistance( const std::string &left, const std::string &right ) {
	size_t asize = left.size();
	size_t bsize = right.size();
	std::vector<size_t> prevrow(bsize+1);
	std::vector<size_t> thisrow(bsize+1);

	for(size_t i = 0; i <= bsize; i++)
		prevrow[i] = i;
	for(size_t i = 1; i <= asize; i ++) {
		thisrow[0] = i;
		for(size_t j = 1; j <= bsize; j++) {
			thisrow[j] = std::min(prevrow[j-1] + size_t(left[i-1] != right[j-1]), 
				1 + std::min(prevrow[j],thisrow[j-1]) );			
		}
		std::swap(thisrow,prevrow);
	}
	return prevrow[bsize];
}
void BkTree::get_nearest_words(std::string center, std::vector<std::string>& flv) {
	if( !m_root ) return ;
	std::queue< Node* > q;
	q.push( m_root );
	while( !q.empty() ) {
		Node *t = q.front(); 
		q.pop();
		if ( !t ) continue;
		size_t d = EditDistance( t->m_item, center );
		if ( d == 0 ) {
			flv.clear();
			return;
		}
		if( d == 1 ) { 
			flv.push_back(t->m_item);
		}
		Node *ch = t->m_firstChild;
		q.push(ch);
		while( ch ) {
			if( ch->m_distToParent >=  1 )
				q.push(ch);
			ch = ch->m_nextSibling;
		}
	}
	std::sort( flv.begin(), flv.end());
	flv.erase( std::unique( flv.begin(), flv.end() ), flv.end() );
	return;
}
void BuildDictionary( BkTree *pDictionary) {
	Timer t("Time taken to build dictionary");
	std::ifstream dictFile("word.list");
	std::string line; 
	if (dictFile.is_open()) {
		while (! dictFile.eof() ) {               
			std::getline (dictFile,line);
			if ( line.size()) {
				pDictionary->insert(line);
			}
		}
		dictFile.close();
	}
	t.stop();
}
int main( int argc, char **argv ) {
	if (argc == 1) {
		return 0;
	}
	BkTree *pDictionary = new BkTree();
	BuildDictionary( pDictionary) ;
	Timer t("Time taken to search a word");
	std::vector<std::string>  flv;
	pDictionary->get_nearest_words(argv[1], flv);
	if ( flv.size() != 0 ) {
		std::cout << "Do you mean ..." << std::endl;
		for ( std::vector<std::string>::iterator itr = flv.begin(); itr != flv.end(); ++itr) {
			std::cout << *itr << std::endl;
		}
	}
	t.stop();
	delete pDictionary;
	return 0;
}

Here is the output.

  avinash@avinash-laptop:~/work/spellcheck$ ./breathalyzer speling
    Time taken to build dictionary
    -------------------------------
    User CPU Time  : 1.55 s
    System CPU Time: 0.64 s
    Wait Time      : 0.12 s
    -------------------------------
    Elapsed Time   : 2.31 s
 Do you mean ...
 seeling
 speeling
 speiling
 spelding
 spelling
 sperling
 spewing
 spieling
 spiling
   Time taken to search a word
   -------------------------------
   User CPU Time  : 2.37 s
   System CPU Time: 0.53 s
   Wait Time      : 0.04 s
   -------------------------------
   Elapsed Time   : 2.94 s
Advertisements

2 thoughts on “Spelling Corrector.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s