Tweets

Anagram Signature Recognition and Match Lookup

This code implements an algorithm to recognize an anagram. A unique signature is computed based on the counts of upper and lower case letters using the assumption that no single character will occur more than 255 times. The signature is then used as the key to index a dictionary (std::map) of pre-loaded words to look up the anagrams of the target word.

The dictionary just loads a predetermined list of words

This interface assumes the STD library is acceptable for use.

Note that this is a console application [uses fprintf(stderr,...)] - the message mechanism could be easily abstracted but doing so would have violated the goal of simplicity, until such time as that abstraction is necessary.

Compatibility/System Requirements

This code was compiled with VC7 on Windows 2000 and Windows XP systems.

Code

 

 

// Anagram recognition and lookup
// Copyright (c) 2004, 2005 Robin Eric Fredericksen
// -- All rights reserved.

//==================================================
// The Interfaces
//==================================================

#include <map>

#include <string>
#include <vector>

using namespace
std;

//==================================================
class CAnagramSignature

{

public
:
// this can be changed in size if 255 instances of a single character are not enough
typedef unsigned char counter_type;
// assume that case counts. This covers both cases but costs twice the space.

enum
{

CHARACTER_COUNT = 26*2,
DWORD_COUNT = (CHARACTER_COUNT*sizeof(counter_type)
+
sizeof(DWORD) - 1)/sizeof(DWORD)
};



protected
:
// for map operator
friend bool operator<( const CAnagramSignature & _roLeft,
const
CAnagramSignature & _roRight);

union

{

// anonymous union may not be portable
counter_type m_factSignature[CHARACTER_COUNT];
DWORD m_fadwSignature[DWORD_COUNT];
}
m_oData;

public
:

CAnagramSignature(LPCTSTR _ptcString = NULL);
void
Signify(LPCTSTR _ptcString);

};



//==================================================
// must define this for using CAnagramSignature as a map<> key
//==================================================
bool operator<( const CAnagramSignature & _roLeft,
const
CAnagramSignature & _roRight);

//==================================================
class CAnagramDictionary
{

public
:

enum

{


MAX_STRING_LENGTH = 1024,
MAX_LINEREAD_BUFFER = MAX_STRING_LENGTH+1

};


protected
:

// for protecting access to map
CRITICAL_SECTION m_oCriticalSection;
// for storing our precomputed responses looked up by unique signature
map<CAnagramSignature, string> m_oSignatureMap;

float
m_fLoadTime;

public
:

CAnagramDictionary(LPCTSTR _ptcFileName=NULL);

virtual
~CAnagramDictionary();
bool
LoadWordList(LPCTSTR _ptcFileName);
string ThreadSafeLookup(const CAnagramSignature & _roSignature);

float
LoadTime(void) { return( m_fLoadTime ); }
};


//==================================================

class CWordList
{

public
:
enum

{

MAX_STRING_LENGTH = 1024,

MAX_LINEREAD_BUFFER = MAX_STRING_LENGTH+1
};


protected
:
// for protecting access to word list

CRITICAL_SECTION m_oCriticalSection;
// for storing our strings
vector<string> m_oWordList;
float
m_fLoadTime;

public
:
CWordList(LPCTSTR _ptcFileName=NULL);
virtual
~CWordList();

bool
LoadWordList(LPCTSTR _ptcFileName);
string ThreadSafeLookup(DWORD _dwIndex);
float
LoadTime(void) { return( m_fLoadTime ); }

};


//==================================================
// The Implementation
//==================================================

#include "Anagrams.h"
#include "ThreadTools.h"
#include <sys/timeb.h>

//==================================================
CAnagramSignature::CAnagramSignature(LPCTSTR _ptcString)
{

Signify(_ptcString);
}


//==================================================
void CAnagramSignature::Signify(LPCTSTR _ptcString)
{

RtlZeroMemory(&m_oData.m_factSignature, sizeof(m_oData.m_factSignature) );

if
( !_ptcString ) return;

for
(LPCTSTR ptcChar = _ptcString; *ptcChar; ptcChar++)
{


TCHAR tChar = *ptcChar;
if
( tChar >= 'a' && tChar <= 'z' )
{


DWORD dwIndex = ((DWORD)tChar)-(DWORD)'a';
m_oData.m_factSignature[dwIndex]++;
}


else if
( tChar >= 'A' && tChar <= 'Z' )
{


DWORD dwIndex = 26 + ((DWORD)tChar)-(DWORD)'Z';

m_oData.m_factSignature[dwIndex]++;
}

else

{
// stop if not in [a-zA-Z]
return;
}
}
}



//==================================================
// must define this for using CAnagramSignature as a map<> key
//==================================================
bool operator<( const CAnagramSignature & _roLeft,
const
CAnagramSignature & _roRight)
{


for
( long K=0; K<CAnagramSignature::DWORD_COUNT; K++)
{


if
( _roLeft.m_oData.m_fadwSignature[K] <
_roRight.m_oData.m_fadwSignature[K] ) return(true);

if
( _roLeft.m_oData.m_fadwSignature[K] >
_roRight.m_oData.m_fadwSignature[K] ) return(false);

// else equal, keep looking
}

return
(false);
}


//==================================================
CAnagramDictionary::CAnagramDictionary(LPCTSTR _ptcFileName)
{


InitializeCriticalSection(&m_oCriticalSection);
if
( _ptcFileName ) LoadWordList(_ptcFileName);
}


//==================================================
CAnagramDictionary::~CAnagramDictionary()
{

DeleteCriticalSection(&m_oCriticalSection);
}


//==================================================
string CAnagramDictionary::ThreadSafeLookup(
const
CAnagramSignature & _roSignature)
{


// protect this from multiple threads
CScopeProtector oScopeProtector(m_oCriticalSection);
// note that there is an implicit conversion of the string
// to a CAnagramSignature object inside the []
string & roszList = m_oSignatureMap[_roSignature];

if
( 0 >= roszList.size() )
{

// !!string not present
// Clear the entry that we entered cause it does not belong
// Note that we *could* learn new words/strings along the way,
// but that is not yet required
m_oSignatureMap[_roSignature].erase();

return
("<NO ENTRY>");
}

// Copying strings is more costly but inherently safe.
// It would be more efficient to return a const char pointer if
// thread safety is externally implemented. However, the lookup
// method inserts into the map automagically, so we can't use
// the "read-only" nature for thread safety.
return(roszList);
}


//==================================================
bool CAnagramDictionary::LoadWordList(LPCTSTR _ptcFileName)
{

struct
_timeb oStart, oEnd;

_ftime( &oStart);

FILE * poInput = fopen(_ptcFileName, "r");

if
( !poInput ) return(false);

char
fatcInputLine[MAX_LINEREAD_BUFFER];

while
( !feof(poInput) && !ferror(poInput) )
{

fgets(fatcInputLine, MAX_STRING_LENGTH, poInput);

if
( ferror(poInput) ) continue;
// null terminate for safety
fatcInputLine[MAX_STRING_LENGTH]=0;

// kill the terminating newline
char * pcNL = strchr(fatcInputLine, '\n');

if
(pcNL) *pcNL = 0;

string & roString = m_oSignatureMap[fatcInputLine];

if
( 0 < roString.size() )
{

roString += "\r\n";
}


roString += fatcInputLine;
}


_ftime( &oEnd);
m_fLoadTime = oEnd.time*1.0f + oEnd.millitm/1000.0f -
oStart.time*1.0f - oStart.millitm/1000.0f;

fprintf(stderr,_T(
"CAnagramDictionary::LoadWordList load time %.3f seconds\n"
),
m_fLoadTime);
return
(true);
}



//==================================================
CWordList::CWordList(LPCTSTR _ptcFileName)
{

InitializeCriticalSection(&m_oCriticalSection);
if
( _ptcFileName ) LoadWordList(_ptcFileName);
}


//==================================================
CWordList::~CWordList()
{

DeleteCriticalSection(&m_oCriticalSection);
}


//==================================================
bool CWordList::LoadWordList(LPCTSTR _ptcFileName)
{


struct
_timeb oStart, oEnd;
_ftime( &oStart);

FILE * poInput = fopen(_ptcFileName, "r");

if
( !poInput ) return(false);

char
fatcInputLine[MAX_LINEREAD_BUFFER];

while
( !feof(poInput) && !ferror(poInput) )
{

fgets(fatcInputLine, MAX_STRING_LENGTH, poInput);

if
( ferror(poInput) ) continue;
// null terminate for safety
fatcInputLine[MAX_STRING_LENGTH]=0;

// kill the terminating newline
char * pcNL = strchr(fatcInputLine, '\n');

if
(pcNL) *pcNL = 0;

// we presume uniqueness in our word list
m_oWordList.push_back( fatcInputLine );
}


_ftime( &oEnd);
m_fLoadTime = oEnd.time*1.0f + oEnd.millitm/1000.0f -
oStart.time*1.0f - oStart.millitm/1000.0f;

fprintf(stderr, _T(
"CWordList::LoadWordList load time %.3f seconds\n"
),
m_fLoadTime);
return
(true);
}


//==================================================
string CWordList::ThreadSafeLookup(DWORD _dwIndex)
{

// protect this from multiple threads
CScopeProtector oScopeProtector(m_oCriticalSection);

// make the index modulo the length
_dwIndex = _dwIndex % m_oWordList.size();

string & roszEntry = m_oWordList[ _dwIndex ];
// Copying strings is more costly but inherently safe.
// It would be more efficient to return a const char pointer if
// thread safety is externally implemented. Also, we may be able to use
// the "read-only" nature for thread safety as long as the structure
// does not change.

return(roszEntry);
}