//
// MakeDictA.cpp 
//
// by Mark Nelson - February 2002
//
// This console based C++ program creates a dictionary for
// use in star encoding. It expects a list of input files
// on the command line, which it reads into a word frequency
// table. The program then creates a dictionary of star 
// encodings and outputs it to stdout. 
//
// The star encoding created by this program is described in
// the article. In essence, it creates a star code using the
// same number of characters as the given word, using as little
// plain text as possible.
//
// Once the dictionary has been created, you can encode and
// decode with StarEncode.cpp and StarDecode.cpp.
//
// This code compiles with Visual C++ 6 SP3 and g++ 2.96:
// 
// cl /GX MakeDictA.cpp
//  - or -
//  g++ -o MakeDictA MakeDictA.cpp
//
#pragma warning( disable : 4786 )
#include <string>
#include <iostream>
#include <fstream>
#include <map>
#include <set>
#include <vector>
#include <algorithm>
using namespace std;

//
// Forward declarations
//
//
string get_token( ifstream &file );
string create_star_code( const string& token, const set<string> &used_codes );

int main(int argc, char* argv[])
{
    if ( argc < 2 )
    {
        cerr << "Usage: dict0 file1 [file2 .... filen]\n\n"
             << "writes the dictionary to stdout\n";
        exit( 0 );
    }
//
// Step 1 is to extract all tokens from the input files and accumulate
// them in the frequency map.
// 
    map< string, int> frequency;
    int pacifier = 0;
    cerr << "Counting words...\n";
    while ( --argc > 0 )
    {
        ifstream infile( *++argv, ios::in | ios::binary );
        cerr << *argv << endl;
        infile.unsetf( ios::skipws );
        //
	// After the file has been opened, I repeatedly extract tokens.
	// Each token is either added to the table or has its count
	// updated.
	for ( ; ; )
        {
            string token = get_token( infile );
            if ( token.size() == 0 )
                break;
            map< string, int >::const_iterator ii = frequency.find( token );
            if ( ii == frequency.end() )
                frequency[ token ] = 1;
            else
                frequency[ token ]++;
            cerr << pacifier++ << "\r";
        }
        cerr << "\n";
    }
//
// The frequency table is now complete. In order to create the star codes,
// we need a table of tokens ordered by their frequency. To get that, I 
// simply insert the tokens into a new map that uses frequency as the 
// key instead of the string.
// 
    cerr << "Sorting frequencies...\n";
    multimap< int, string > counts;
    for ( map< string, int>::iterator ii = frequency.begin() ; 
          ii != frequency.end();
          ii++ )
      counts.insert( pair<int,string>( (*ii).second, (*ii).first ) );
//
// Now I just interate through the counts map, getting each token and
// creating a star code for it. As I create the star codes I output the
// token, its star code, and the tokens count to stdout - which is
// presumably being piped to the dictionary file.
//
    map< string, string > codes;
    set<string> used_codes;
    cerr << "Creating star codes...\n";
    pacifier = 0;
    for ( multimap< int, string >::reverse_iterator jj = counts.rbegin() ; jj != counts.rend() ; jj++ )
    {
        string code = create_star_code( (*jj).second, used_codes );
        used_codes.insert( code );
        codes[ (*jj).second ] = code;
        cout << (*jj).second << " " << code << " " << (*jj).first << endl;
        cerr << pacifier++ << "\r";
    }
    cerr << "\n";
    return 0;
}

//
// This routine is used to get tokens from a file. For
// star encoding, we want to get complete alphanumeric 
// tokens. Note that it just skips over and ignores
// punctuation and white space. The one exception to this
// rule is the single quote - it tokenizes those as well.
//

string get_token( ifstream &file )
{
    string token;
    while ( file )
    {
        char c;
        file >> c;
        if ( isalpha( c ) || c == '\'' )
            token += c;
        else if ( token.size() )
            return token;
    }
    return token;
}


string create_star_code( const string& token, const set<string> &used_codes )
{
//
// We try to create star codes with progressively fewer stars. If we get to 0 stars, we just
// insert the plain text in the code book
//
    for ( int star_count = token.size() ; star_count > 0 ; star_count-- )
    {
        vector<char> pattern( token.size(), '*' );
        for ( int i = star_count ; i < token.size() ; i++ )
            pattern[ i ] = '-';
        do 
        {
            string test( token );
            for ( int j = 0 ; j < token.size() ; j++ )
            if ( pattern[ j ] == '*' )
                test[ j ] = '*';
            set<string>::const_iterator kk = used_codes.find( test );
            if ( kk == used_codes.end() )
                return test;
       } while ( next_permutation( pattern.begin(), pattern.end() ) );
   }
   return token;
}


