/*
 * Listing 12 -- model-2.c
 *
 * This module contains all of the modeling functions used with
 * comp-2.c and expand-2.c.  This modeling unit keeps track of
 * all contexts from 0 up to max_order, which defaults to 3.
 * In addition, there is a special context -1 which is a fixed model
 * used to encode previously unseen characters, and a context -2
 * which is used to encode EOF and FLUSH messages.
 *
 * Each context is stored in a special CONTEXT structure, which is
 * documented below.  Context tables are not created until the
 * context is seen, and they are never destroyed.
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "coder.h"
#include "model.h"

/*
 * This program consumes massive amounts of memory.  One way to
 * handle large amounts of memory is to use Zortech's __handle
 * pointer type.  So that my code will run with other compilers
 * as well, the handle stuff gets redefined when using other
 * compilers.
 */
#ifdef __ZTC__
#include <handle.h>
#else
#define __handle
#define handle_calloc( a )        calloc( (a), 1 )
#define handle_realloc( a, b )    realloc( (a), (b) )
#define handle_free( a )          free( (a) )
#endif

/* A context table contains a list of the counts for all symbols
 * that have been seen in the defined context.  For example, a
 * context of "Zor" might have only had 2 different characters
 * appear.  't' might have appeared 10 times, and 'l' might have
 * appeared once.  These two counts are stored in the context
 * table.  The counts are stored in the STATS structure.  All of
 * the counts for a given context are stored in and array of STATS.
 * As new characters are added to a particular contexts, the STATS
 * array will grow.  Sometimes, the STATS array will shrink
 * after flushing the model.
 */
typedef struct {
                unsigned char symbol;
                unsigned char counts;
               } STATS;

/*
 * Each context has to have links to higher order contexts.  These
 * links are used to navigate through the context tables.  For example,
 * to find the context table for "ABC", I start at the order 0 table,
 * then find the pointer to the "A" context table by looking through
 * then LINKS array.  At that table, we find the "B" link and go to
 * that table.  The process continues until the destination table is
 * found.  The table pointed to by the LINKS array corresponds to the
 * symbol found at the same offset in the STATS table.  The reason that
 * LINKS is in a separate structure instead of being combined with
 * STATS is to save space.  All of the leaf context nodes don't need
 * next pointers, since they are in the highest order context.  In the
 * leaf nodes, the LINKS array is a NULL pointers.
 */
typedef struct {
                 struct context *next;
               } LINKS;

/*
 * The CONTEXT structure holds all of the know information about
 * a particular context.  The links and stats pointers are discussed
 * immediately above here.  The max_index element gives the maximum
 * index that can be applied to the stats or link array.  When the
 * table is first created, and stats is set to NULL, max_index is set
 * to -1.  As soon as single element is added to stats, max_index is
 * incremented to 0.
 *
 * The lesser context pointer is a navigational aid.  It points to
 * the context that is one less than the current order.  For example,
 * if the current context is "ABC", the lesser_context pointer will
 * point to "BC".  The reason for maintaining this pointer is that
 * this particular bit of table searching is done frequently, but
 * the pointer only needs to be built once, when the context is
 * created.
 */
typedef struct context {
                         int max_index;
                         LINKS __handle *links;
                         STATS __handle *stats;
                         struct context *lesser_context;
                       } CONTEXT;
/*
 * max_order is the maximum order that will be maintained by this
 * program.  EXPAND-2 and COMP-2 both will modify this int based
 * on command line parameters.
 */
int max_order=3;
/*
 * *contexts[] is an array of current contexts.  If I want to find
 * the order 0 context for the current state of the model, I just
 * look at contexts[0].  This array of context pointers is set up
 * every time the model is updated.
 */
CONTEXT **contexts;
/*
 * current_order contains the current order of the model.  It starts
 * at max_order, and is decremented every time an ESCAPE is sent.  It
 * will only go down to -1 for normal symbols, but can go to -2 for
 * EOF and FLUSH.
 */
int current_order;
/*
 * This variable tells COMP-2.C that the FLUSH symbol can be
 * sent using this model.
 */
int flushing_enabled=1;
/*
 * This table contains the cumulative totals for the current context.
 * Because this program is using exclusion, totals has to be calculated
 * every time a context is used.  The scoreboard array keeps track of
 * symbols that have appeared in higher order models, so that they
 * can be excluded from lower order context total calculations.
 */
short int totals[ 258 ];
char scoreboard[ 256 ];

/*
 * Local procedure declarations.
 */
void error_exit( char *message );
void update_table( CONTEXT *table, unsigned char symbol );
void rescale_table( CONTEXT *table );
void totalize_table( CONTEXT *table );
CONTEXT *shift_to_next_context( CONTEXT *table, unsigned char c, int order);
CONTEXT *allocate_next_order_table( CONTEXT *table,
                                    unsigned char symbol,
                                    CONTEXT *lesser_context );

/*
 * This routine has to get everything set up properly so that
 * the model can be maintained properly.  The first step is to create
 * the *contexts[] array used later to find current context tables.
 * The *contexts[] array indices go from -2 up to max_order, so
 * the table needs to be fiddled with a little.  This routine then
 * has to create the special order -2 and order -1 tables by hand,
 * since they aren't quite like other tables.  Then the current
 * context is set to \0, \0, \0, ... and the appropriate tables
 * are built to support that context.  The current order is set
 * to max_order, the scoreboard is cleared, and the system is
 * ready to go.
 */

void initialize_model()
{
    int i;
    CONTEXT *null_table;
    CONTEXT *control_table;

    current_order = max_order;
    contexts = (CONTEXT **) calloc( sizeof( CONTEXT * ), 10 );
    if ( contexts == NULL )
        error_exit( "Failure #1: allocating context table!" );
    contexts += 2;
    null_table = (CONTEXT *) calloc( sizeof( CONTEXT ), 1 );
    if ( null_table == NULL )
        error_exit( "Failure #2: allocating null table!" );
    null_table->max_index = -1;
    contexts[ -1 ] = null_table;
    for ( i = 0 ; i <= max_order ; i++ )
        contexts[ i ] = allocate_next_order_table( contexts[ i-1 ],
                                               0,
                                               contexts[ i-1 ] );
    handle_free( (char __handle *) null_table->stats );
    null_table->stats =
         (STATS __handle *) handle_calloc( sizeof( STATS ) * 256 );
    if ( null_table->stats == NULL )
        error_exit( "Failure #3: allocating null table!" );
    null_table->max_index = 255;
    for ( i=0 ; i < 256 ; i++ )
    {
        null_table->stats[ i ].symbol = (unsigned char) i;
        null_table->stats[ i ].counts = 1;
    }

    control_table = (CONTEXT *) calloc( sizeof(CONTEXT), 1 );
    if ( control_table == NULL )
        error_exit( "Failure #4: allocating null table!" );
    control_table->stats =
         (STATS __handle *) handle_calloc( sizeof( STATS ) * 2 );
    if ( control_table->stats == NULL )
        error_exit( "Failure #5: allocating null table!" );
    contexts[ -2 ] = control_table;
    control_table->max_index = 1;
    control_table->stats[ 0 ].symbol = -FLUSH;
    control_table->stats[ 0 ].counts = 1;
    control_table->stats[ 1 ].symbol =- DONE;
    control_table->stats[ 1 ].counts = 1;

    for ( i = 0 ; i < 256 ; i++ )
        scoreboard[ i ] = 0;
}
/*
 * This is a utility routine used to create new tables when a new
 * context is created.  It gets a pointer to the current context,
 * and gets the symbol that needs to be added to it.  It also needs
 * a pointer to the lesser context for the table that is to be
 * created.  For example, if the current context was "ABC", and the
 * symbol 'D' was read in, add_character_to_model would need to
 * create the new context "BCD".  This routine would get called
 * with a pointer to "BC", the symbol 'D', and a pointer to context
 * "CD".  This routine then creates a new table for "BCD", adds it
 * to the link table for "BC", and gives "BCD" a back pointer to
 * "CD".  Note that finding the lesser context is a difficult
 * task, and isn't done here.  This routine mainly worries about
 * modifying the stats and links fields in the current context.
 */

CONTEXT *allocate_next_order_table( CONTEXT *table,
                                    unsigned char symbol,
                                    CONTEXT *lesser_context )
{
    CONTEXT *new_table;
    int i;
    unsigned int new_size;

    for ( i = 0 ; i <= table->max_index ; i++ )
        if ( table->stats[ i ].symbol == symbol )
            break;
    if ( i > table->max_index )
    {
        table->max_index++;
        new_size = sizeof( LINKS );
        new_size *= table->max_index + 1;
        if ( table->links == NULL )
            table->links = (LINKS __handle *) handle_calloc( new_size );
        else
            table->links = (LINKS __handle *)
                 handle_realloc( (char __handle *) table->links, new_size );
        new_size = sizeof( STATS );
        new_size *= table->max_index + 1;
        if ( table->stats == NULL )
            table->stats = (STATS __handle *) handle_calloc( new_size );
        else
            table->stats = (STATS __handle *)
                handle_realloc( (char __handle *) table->stats, new_size );
        if ( table->links == NULL )
            error_exit( "Failure #6: allocating new table" );
        if ( table->stats == NULL )
            error_exit( "Failure #7: allocating new table" );
        table->stats[ i ].symbol = symbol;
        table->stats[ i ].counts = 0;
    }
    new_table = (CONTEXT *) calloc( sizeof( CONTEXT ), 1 );
    if ( new_table == NULL )
        error_exit( "Failure #8: allocating new table" );
    new_table->max_index = -1;
    table->links[ i ].next = new_table;
    new_table->lesser_context = lesser_context;
    return( new_table );
}

/*
 * This routine is called to increment the counts for the current
 * contexts.  It is called after a character has been encoded or
 * decoded.  All it does is call update_table for each of the
 * current contexts, which does the work of incrementing the count.
 * This particular version of update_model() practices update exclusion,
 * which means that if lower order models weren't used to encode
 * or decode the character, they don't get their counts updated.
 * This seems to improve compression performance quite a bit.
 * To disable update exclusion, the loop would be changed to run
 * from 0 to max_order, instead of current_order to max_order.
 */
void update_model( int symbol )
{
    int i;
    int local_order;

    if ( current_order < 0 )
        local_order = 0;
    else
        local_order = current_order;
    if ( symbol >= 0 )
    {
        while ( local_order <= max_order )
        {
            if ( symbol >= 0 )
                update_table( contexts[ local_order ], (unsigned char) symbol );
            local_order++;
        }
    }
    current_order = max_order;
    for ( i = 0 ; i < 256 ; i++ )
        scoreboard[ i ] = 0;
}
/*
 * This routine is called to update the count for a particular symbol
 * in a particular table.  The table is one of the current contexts,
 * and the symbol is the last symbol encoded or decoded.  In principle
 * this is a fairly simple routine, but a couple of complications make
 * things a little messier.  First of all, the given table may not
 * already have the symbol defined in its statistics table.  If it
 * doesn't, the stats table has to grow and have the new guy added
 * to it.  Secondly, the symbols are kept in sorted order by count
 * in the table so as that the table can be trimmed during the flush
 * operation.  When this symbol is incremented, it might have to be moved
 * up to reflect its new rank.  Finally, since the counters are only
 * bytes, if the count reaches 255, the table absolutely must be rescaled
 * to get the counts back down to a reasonable level.
 */
void update_table( CONTEXT *table, unsigned char symbol )
{
    int i;
    int index;
    unsigned char temp;
    CONTEXT *temp_ptr;
    unsigned int new_size;
/*
 * First, find the symbol in the appropriate context table.  The first
 * symbol in the table is the most active, so start there.
 */
    index = 0;
    while ( index <= table->max_index &&
            table->stats[index].symbol != symbol )
        index++;
    if ( index > table->max_index )
    {
        table->max_index++;
        new_size = sizeof( LINKS );
        new_size *= table->max_index + 1;
        if ( current_order < max_order )
        {
            if ( table->max_index == 0 )
                table->links = (LINKS __handle *) handle_calloc( new_size );
            else
                table->links = (LINKS __handle *)
                   handle_realloc( (char __handle *) table->links, new_size );
            if ( table->links == NULL )
                error_exit( "Error #9: reallocating table space!" );
            table->links[ index ].next = NULL;
        }
        new_size = sizeof( STATS );
        new_size *= table->max_index + 1;
        if (table->max_index==0)
            table->stats = (STATS __handle *) handle_calloc( new_size );
        else
            table->stats = (STATS __handle *)
                handle_realloc( (char __handle *) table->stats, new_size );
        if ( table->stats == NULL )
            error_exit( "Error #10: reallocating table space!" );
        table->stats[ index ].symbol = symbol;
        table->stats[ index ].counts = 0;
    }
/*
 * Now I move the symbol to the front of its list.
 */
    i = index;
    while ( i > 0 &&
            table->stats[ index ].counts == table->stats[ i-1 ].counts )
        i--;
    if ( i != index )
    {
        temp = table->stats[ index ].symbol;
        table->stats[ index ].symbol = table->stats[ i ].symbol;
        table->stats[ i ].symbol = temp;
        if ( table->links != NULL )
        {
            temp_ptr = table->links[ index ].next;
            table->links[ index ].next = table->links[ i ].next;
            table->links[ i ].next = temp_ptr;
        }
        index = i;
    }
/*
 * The switch has been performed, now I can update the counts
 */
    table->stats[ index ].counts++;
    if ( table->stats[ index ].counts == 255 )
        rescale_table( table );
}

/*
 * This routine is called when a given symbol needs to be encoded.
 * It is the job of this routine to find the symbol in the context
 * table associated with the current table, and return the low and
 * high counts associated with that symbol, as well as the scale.
 * Finding the table is simple.  Unfortunately, once I find the table,
 * I have to build the table of cumulative counts, which is
 * expensive, and is done elsewhere.  If the symbol is found in the
 * table, the appropriate counts are returned.  If the symbols is
 * not found, the ESCAPE symbol probabilities are returned, and
 * the current order is reduced.  Note also the kludge to support
 * the order -2 character set, which consists of negative numbers
 * instead of unsigned chars.  This insures that no match will every
 * be found for the EOF or FLUSH symbols in any "normal" table.
 */
int convert_int_to_symbol( int c, SYMBOL *s )
{
    int i;
    CONTEXT *table;

    table = contexts[ current_order ];
    totalize_table( table );
    s->scale = totals[ 0 ];
    if ( current_order == -2 )
        c = -c;
    for ( i = 0 ; i <= table->max_index ; i++ )
    {
        if ( c == (int) table->stats[ i ].symbol )
        {
            if ( table->stats[ i ].counts == 0 )
                break;
            s->low_count = totals[ i+2 ];
            s->high_count = totals[ i+1 ];
            return( 0 );
        }
    }
    s->low_count = totals[ 1 ];
    s->high_count = totals[ 0 ];
    current_order--;
    return( 1 );
}
/*
 * This routine is called when decoding an arithmetic number.  In
 * order to decode the present symbol, the current scale in the
 * model must be determined.  This requires looking up the current
 * table, then building the totals table.  Once that is done, the
 * cumulative total table has the symbol scale at element 0.
 */
void get_symbol_scale( SYMBOL *s )
{
    CONTEXT *table;

    table = contexts[ current_order ];
    totalize_table( table );
    s->scale = totals[ 0 ];
}

/*
 * This routine is called during decoding.  It is given a count that
 * came out of the arithmetic decoder, and has to find the symbol that
 * matches the count.  The cumulative totals are already stored in the
 * totals[] table, form the call to get_symbol_scale, so this routine
 * just has to look through that table.  Once the match is found,
 * the appropriate character is returned to the caller.  Two possible
 * complications.  First, the character might be the ESCAPE character,
 * in which case the current_order has to be decremented.  The other
 * complication is that the order might be -2, in which case we return
 * the negative of the symbol so it isn't confused with a normal
 * symbol.
 */
int convert_symbol_to_int( int count, SYMBOL *s)
{
    int c;
    CONTEXT *table;

    table = contexts[ current_order ];
    for ( c = 0; count < totals[ c ] ; c++ )
        ;
    s->high_count = totals[ c-1 ];
    s->low_count = totals[ c ];
    if ( c == 1 )
    {
        current_order--;
        return( ESCAPE );
    }
    if ( current_order < -1 )
        return( (int) -table->stats[ c-2 ].symbol );
    else
        return( table->stats[ c-2 ].symbol );
}


/*
 * After the model has been updated for a new character, this routine
 * is called to "shift" into the new context.  For example, if the
 * last context was "ABC", and the symbol 'D' had just been processed,
 * this routine would want to update the context pointers to that
 * contexts[1]=="D", contexts[2]=="CD" and contexts[3]=="BCD".  The
 * potential problem is that some of these tables may not exist.
 * The way this is handled is by the shift_to_next_context routine.
 * It is passed a pointer to the "ABC" context, along with the symbol
 * 'D', and its job is to return a pointer to "BCD".  Once we have
 * "BCD", we can follow the lesser context pointers in order to get
 * the pointers to "CD" and "C".  The hard work was done in
 * shift_to_context().
 */
void add_character_to_model( int c )
{
    int i;
    if ( max_order < 0 || c < 0 )
       return;
    contexts[ max_order ] =
       shift_to_next_context( contexts[ max_order ],
                              (unsigned char) c, max_order );
    for ( i = max_order-1 ; i > 0 ; i-- )
        contexts[ i ] = contexts[ i+1 ]->lesser_context;
}

/*
 * This routine is called when adding a new character to the model. From
 * the previous example, if the current context was "ABC", and the new
 * symbol was 'D', this routine would get called with a pointer to
 * context table "ABC", and symbol 'D', with order max_order.  What this
 * routine needs to do then is to find the context table "BCD".  This
 * should be an easy job, and it is if the table already exists.  All
 * we have to in that case is follow the back pointer from "ABC" to "BC".
 * We then search the link table of "BC" until we find the linke to "D".
 * That link points to "BCD", and that value is then returned to the
 * caller.  The problem crops up when "BC" doesn't have a pointer to
 * "BCD".  This generally means that the "BCD" context has not appeared
 * yet.  When this happens, it means a new table has to be created and
 * added to the "BC" table.  That can be done with a single call to
 * the allocate_new_table routine.  The only problem is that the
 * allocate_new_table routine wants to know what the lesser context for
 * the new table is going to be.  In other words, when I create "BCD",
 * I need to know where "CD" is located.  In order to find "CD", I
 * have to recursively call shift_to_next_context, passing it a pointer
 * to context "C" and they symbol 'D'.  It then returns a pointer to
 * "CD", which I use to create the "BCD" table.  The recursion is guaranteed
 * to end if it ever gets to order -1, because the null table is
 * guaranteed to have a for every symbol to the order 0 table.  This is
 * the most complicated part of the modeling program, but it is
 * necessary for performance reasons.
 */
CONTEXT *shift_to_next_context( CONTEXT *table, unsigned char c, int order)
{
    int i;
    CONTEXT *new_lesser;
/*
 * First, try to find the new context by backing up to the lesser
 * context and searching its link table.  If I find the link, we take
 * a quick and easy exit, returning the link.  Note that their is a
 * special Kludge for context order 0.  We know for a fact that
 * the lesser context pointer at order 0 points to the null table,
 * order -1, and we know that the -1 table only has a single link
 * pointer, which points back to the order 0 table.
 */
    table = table->lesser_context;
    if ( order == 0 )
        return( table->links[ 0 ].next );
    for ( i = 0 ; i <= table->max_index ; i++ )
        if ( table->stats[ i ].symbol == c )
            if ( table->links[ i ].next != NULL )
                return( table->links[ i ].next );
            else
                break;
/*
 * If I get here, it means the new context did not exist.  I have to
 * create the new context, add a link to it here, and add the backwards
 * link to *his* previous context.  Creating the table and adding it to
 * this table is pretty easy, but adding the back pointer isn't.  Since
 * creating the new back pointer isn't easy, I duck my responsibility
 * and recurse to myself in order to pick it up.
 */
    new_lesser = shift_to_next_context( table, c, order-1 );
/*
 * Now that I have the back pointer for this table, I can make a call
 * to a utility to allocate the new table.
 */
    table = allocate_next_order_table( table, c, new_lesser );
    return( table );
}

/*
 * Rescaling the table needs to be done for one of three reasons.
 * First, if the maximum count for the table has exceeded 16383, it
 * means that arithmetic coding using 16 and 32 bit registers might
 * no longer work.  Secondly, if an individual symbol count has
 * reached 255, it will no longer fit in a byte.  Third, if the
 * current model isn't compressing well, the compressor program may
 * want to rescale all tables in order to give more weight to newer
 * statistics.  All this routine does is divide each count by 2.
 * If any counts drop to 0, the counters can be removed from the
 * stats table, but only if this is a leaf context.  Otherwise, we
 * might cut a link to a higher order table.
 */
void rescale_table( CONTEXT *table )
{
    int i;

    if ( table->max_index == -1 )
        return;
    for ( i = 0 ; i <= table->max_index ; i++ )
        table->stats[ i ].counts /= 2;
    if ( table->stats[ table->max_index ].counts == 0 &&
         table->links == NULL )
    {
        while ( table->stats[ table->max_index ].counts == 0 &&
                table->max_index >= 0 )
            table->max_index--;
        if ( table->max_index == -1 )
        {
            handle_free( (char __handle *) table->stats );
            table->stats = NULL;
        }
        else
        {
            table->stats = (STATS __handle *)
                handle_realloc( (char __handle *) table->stats,
                                 sizeof( STATS ) * ( table->max_index + 1 ) );
            if ( table->stats == NULL )
                error_exit( "Error #11: reallocating stats space!" );
        }
    }
}

/*
 * This routine has the job of creating a cumulative totals table for
 * a given context.  The cumulative low and high for symbol c are going to
 * be stored in totals[c+2] and totals[c+1].  Locations 0 and 1 are
 * reserved for the special ESCAPE symbol.  The ESCAPE symbol
 * count is calculated dynamically, and changes based on what the
 * current context looks like.  Note also that this routine ignores
 * any counts for symbols that have already showed up in the scoreboard,
 * and it adds all new symbols found here to the scoreboard.  This
 * allows us to exclude counts of symbols that have already appeared in
 * higher order contexts, improving compression quite a bit.
 */
void totalize_table( CONTEXT *table )
{
    int i;
    unsigned char max;

    for ( ; ; )
    {
        max = 0;
        i = table->max_index + 2;
        totals[ i ] = 0;
        for ( ; i > 1 ; i-- )
        {
            totals[ i-1 ] = totals[ i ];
            if ( table->stats[ i-2 ].counts )
                if ( ( current_order == -2 ) ||
                     scoreboard[ table->stats[ i-2 ].symbol ] == 0 )
                     totals[ i-1 ] += table->stats[ i-2 ].counts;
            if ( table->stats[ i-2 ].counts > max )
                max = table->stats[ i-2 ].counts;
        }
/*
 * Here is where the escape calculation needs to take place.
 */
        if ( max == 0 )
            totals[ 0 ] = 1;
        else
        {
            totals[ 0 ] = (short int) ( 256 - table->max_index );
            totals[ 0 ] *= table->max_index;
            totals[ 0 ] /= 256;
            totals[ 0 ] /= max;
            totals[ 0 ]++;
            totals[ 0 ] += totals[ 1 ];
        }
        if ( totals[ 0 ] < MAXIMUM_SCALE )
            break;
        rescale_table( table );
    }
    for ( i = 0 ; i < table->max_index ; i++ )
	if (table->stats[i].counts != 0)
            scoreboard[ table->stats[ i ].symbol ] = 1;
}

/*
 * This routine is called when the entire model is to be flushed.
 * This is done in an attempt to improve the compression ratio by
 * giving greater weight to upcoming statistics.  This routine
 * starts at the given table, and recursively calls itself to
 * rescale every table in its list of links.  The table itself
 * is then rescaled.
 */
void recursive_flush( CONTEXT *table )
{
    int i;

    if ( table->links != NULL )
        for ( i = 0 ; i <= table->max_index ; i++ )
            if ( table->links[ i ].next != NULL )
                recursive_flush( table->links[ i ].next );
    rescale_table( table );
}

/*
 * This routine is called to flush the whole table, which it does
 * by calling the recursive flush routine starting at the order 0
 * table.
 */
void flush_model()
{
    recursive_flush( contexts[ 0 ] );
}

void error_exit( char *message)
{
    putc( '\n', stdout );
    puts( message );
    exit( -1 );
}



