/********************************************************************\
 * This program is free software; you can redistribute it and/or    *
 * modify it under the terms of the GNU General Public License as   *
 * published by the Free Software Foundation; either version 2 of   *
 * the License, or (at your option) any later version.              *
 *                                                                  *
 * This program is distributed in the hope that it will be useful,  *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of   *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    *
 * GNU General Public License for more details.                     *
 *                                                                  *
 * You should have received a copy of the GNU General Public License*
 * along with this program; if not, contact:                        *
 *                                                                  *
 * Free Software Foundation           Voice:  +1-617-542-5942       *
 * 51 Franklin Street, Fifth Floor    Fax:    +1-617-542-2652       *
 * Boston, MA  02110-1301,  USA       gnu@gnu.org                   *
\********************************************************************/
/** @addtogroup Import_Export
    @{ */
/** @internal
	@file import-match-map.c
    @brief Generic import mapper service, maps strings->accounts
    *
    An import mapper service that stores Account Maps for the
    generic importer.  This allows importers to map various
    "strings" to Gnucash accounts in a generic manner.
    @author Copyright (C) 2002,2003 Derek Atkins <derek@ihtfp.com>
 */
#include "config.h"
#include <string.h>
#include <glib.h>
#include "import-match-map.h"
#include "gnc-ui-util.h"
#include "gnc-engine.h"
#include "gnc-features.h"

/********************************************************************\
 *   Constants   *
\********************************************************************/

/* Log module tag for this file.  Presumably picked up implicitly by the QOF
 * logging macros (PINFO, PERR, ENTER, LEAVE) used below -- confirm against
 * the logging header. */
static QofLogModule log_module = GNC_MOD_IMPORT;


/** An import map: a KVP frame together with the entity it was obtained from.
 *  Instances are built by gnc_imap_create_from_frame(), which requires that
 *  exactly one of an account or a book is supplied.
 */
struct _GncImportMatchMap
{
    kvp_frame *	frame;  /* slots frame the map entries are read from and written to */
    Account *	acc;    /* source account; NULL when the map was created from a book */
    QofBook *	book;   /* the account's book, or the book the map was created from */
};

/* First path component of the plain (category/key -> account GUID) map. */
#define IMAP_FRAME		"import-map"
/* First path component of the bayesian (token -> per-account count) map. */
#define IMAP_FRAME_BAYES	"import-map-bayes"

/* Allocate a GncImportMatchMap that wraps @frame.
 *
 * Exactly one of @acc and @book must be non-NULL.  When an account is given,
 * the book stored in the map is the one returned by gnc_account_get_book()
 * for that account; otherwise it is @book itself.
 *
 * Returns the newly allocated map (release it with gnc_imap_destroy()), or
 * NULL when one of the preconditions fails.
 */
static GncImportMatchMap *
gnc_imap_create_from_frame (kvp_frame *frame, Account *acc, QofBook *book)
{
    GncImportMatchMap *map;

    g_return_val_if_fail (frame != NULL, NULL);
    g_return_val_if_fail ((acc && !book) || (!acc && book), NULL);

    map = g_new0 (GncImportMatchMap, 1);

    /* Keep the account (possibly NULL) for marking dirtiness and cache
     * the book so that later lookups do not need to go via the account.
     */
    map->frame = frame;
    map->acc = acc;
    map->book = acc ? gnc_account_get_book (acc) : book;

    return map;
}

/** Obtain an ImportMatchMap object backed by the slots of an Account.
 *  Returns NULL when @acc is NULL or when its slots frame is NULL. */
GncImportMatchMap * gnc_imap_create_from_account (Account *acc)
{
    kvp_frame *frame = NULL;

    if (acc != NULL)
    {
        frame = xaccAccountGetSlots (acc);
        g_return_val_if_fail (frame != NULL, NULL);

        /* account-backed map: no explicit book is passed */
        return gnc_imap_create_from_frame (frame, acc, NULL);
    }

    return NULL;
}

/** Obtain an ImportMatchMap object backed by the slots of a Book.
 *  Returns NULL when @book is NULL or when its slots frame is NULL. */
GncImportMatchMap * gnc_imap_create_from_book (QofBook *book)
{
    kvp_frame *frame = NULL;

    if (book != NULL)
    {
        frame = qof_book_get_slots (book);
        g_return_val_if_fail (frame != NULL, NULL);

        /* book-backed map: no account is associated with it */
        return gnc_imap_create_from_frame (frame, NULL, book);
    }

    return NULL;
}

/** Destroy an import map.
 *  Only the map object itself is released; the frame, account and book it
 *  points at are not touched.  Passing NULL is allowed.
 */
void gnc_imap_destroy (GncImportMatchMap *imap)
{
    /* g_free() is a no-op for NULL, so no separate guard is needed */
    g_free (imap);
}

/** Clear an import map -- this removes ALL entries in the map.
 *
 *  Both the plain map (IMAP_FRAME) and the bayesian map (IMAP_FRAME_BAYES)
 *  are removed from the map's frame, inside an account edit on imap->acc.
 *  Does nothing when @imap is NULL.
 *
 *  NOTE(review): for a map created with gnc_imap_create_from_book(),
 *  imap->acc is NULL; confirm that the account edit / dirty calls below
 *  tolerate that, or that book-backed maps are never cleared.
 */
void gnc_imap_clear (GncImportMatchMap *imap)
{
    if (!imap) return;

    xaccAccountBeginEdit (imap->acc);

    /* The key path is variadic and must end with a NULL sentinel, exactly
     * as in every other kvp_frame_*_slot_path() call in this file.
     */

    /* Clear the IMAP_FRAME kvp */
    kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME, NULL);

    /* Clear the bayes kvp, IMAP_FRAME_BAYES */
    kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME_BAYES, NULL);

    qof_instance_set_dirty (QOF_INSTANCE (imap->acc));
    xaccAccountCommitEdit (imap->acc);
}

/** Look up an Account in the map.
 *
 *  The entry is searched under IMAP_FRAME/<category>/<key>.  When @category
 *  is NULL the key itself is used as the only path component below
 *  IMAP_FRAME.
 *
 *  Returns the result of looking the stored GUID up in the map's book, or
 *  NULL when @imap or @key is NULL or no entry exists.
 */
Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category,
                                 const char *key)
{
    const char *first_component = category;
    const char *second_component = key;
    kvp_value *slot;

    if (imap == NULL || key == NULL)
        return NULL;

    if (category == NULL)
    {
        /* no category: the key moves up one level and the path ends there */
        first_component = key;
        second_component = NULL;
    }

    slot = kvp_frame_get_slot_path (imap->frame, IMAP_FRAME, first_component,
                                    second_component, NULL);
    if (slot == NULL)
        return NULL;

    return xaccAccountLookup (kvp_value_get_guid (slot), imap->book);
}

/** Store an Account in the map.
 *
 *  The GUID of @acc is written to IMAP_FRAME/<category>/<key> in the map's
 *  frame; when @category is NULL the key is used as the only path component
 *  below IMAP_FRAME.  The write happens inside an account edit on imap->acc,
 *  which is also marked dirty.
 *
 *  Silently does nothing when @imap, @key or @acc is NULL or @key is empty.
 *
 *  NOTE(review): for a map created with gnc_imap_create_from_book(),
 *  imap->acc is NULL; confirm that the account edit / dirty calls below
 *  tolerate that.
 */
void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
                           const char *key, Account *acc)
{
    kvp_value *value;

    /* acc is validated here, so no further check on it is needed below */
    if (!imap || !key || !acc || (*key == '\0')) return;
    if (!category)
    {
        category = key;
        key = NULL;
    }

    value = kvp_value_new_guid (xaccAccountGetGUID (acc));
    g_return_if_fail (value != NULL);

    xaccAccountBeginEdit (imap->acc);
    /* a NULL key simply terminates the variadic path one component early */
    kvp_frame_set_slot_path (imap->frame, value, IMAP_FRAME, category, key, NULL);
    qof_instance_set_dirty (QOF_INSTANCE (imap->acc));
    xaccAccountCommitEdit (imap->acc);

    /* our local value is no longer needed once it has been stored */
    kvp_value_delete (value);
}




/*--------------------------------------------------------------------------
 Below here is the bayes transaction to account matching system
--------------------------------------------------------------------------*/


/** How often one token has been recorded for one account. */
struct account_token_count
{
    char * account_guid; /**< GUID of the account in string form; filled with g_strdup() copies */
    gint64 token_count; /**< occurrences of a given token for this account_guid */
};

/** Per-token summary: total_count and the token_count for a given account
 * let us calculate the probability of a given account with any single token.
 */
struct token_accounts_info
{
    GList *accounts; /**< list of struct account_token_count, one per account */
    gint64 total_count; /**< sum of the counts recorded for this token */
    QofBook * book; /**< book used to resolve account full names; only set by
                     * the non-flat lookup path */
};

/** Return the first element of @haystack whose account_guid string equals
 *  that of @needle (compared with g_strcmp0), or NULL when there is none.
 */
static struct account_token_count * findAccountTokenCount (GList * haystack, struct account_token_count * needle)
{
    GList * node = haystack;

    while (node != NULL)
    {
        struct account_token_count * candidate = node->data;

        if (g_strcmp0 (needle->account_guid, candidate->account_guid) == 0)
            return candidate;

        node = node->next;
    }

    return NULL;
}

/**
 * kvp_frame_for_each_slot() callback: fold one (account key, count) slot of
 * a token frame into the struct token_accounts_info passed as @data.
 *
 * @param key    either an account GUID string or an account full name
 * @param value  gint64 count of the token for that account
 * @param data   struct token_accounts_info* being built; its book is used to
 *               resolve full names
 *
 * \note Cannot assume keys are unique. Account names and GUIDs can both
 * come through here referring to the same account, so counts for the same
 * account are aggregated into a single list entry.
 *
 * NOTE(review): as before, the count of a record whose account cannot be
 * found is still added to total_count; confirm that this is intended.
 */
static void buildTokenInfo(const char *key, kvp_value *value, gpointer data)
{
    struct token_accounts_info * tokenInfo = (struct token_accounts_info*)data;
    struct account_token_count * otherTokenInfo = NULL;
    struct account_token_count * this_account;
    gint64 count = kvp_value_get_gint64(value);
    gchar tempbuff [GUID_ENCODING_LENGTH + 1];
    const gchar * guid_str;
    GncGUID temp_guid;

    tokenInfo->total_count += count;

    /* Work out the GUID string first, so that nothing has been allocated
     * yet if the record turns out to be unusable.
     */
    if (string_to_guid (key, &temp_guid))
    {
        /* the key is a guid */
        guid_str = key;
    }
    else
    {
        Account * account = gnc_account_lookup_by_full_name(
                gnc_book_get_root_account(tokenInfo->book), key);
        if (!account)
            /* dud record. */
            return;
        guid_to_string_buff (xaccAccountGetGUID (account), tempbuff);
        guid_str = tempbuff;
    }

    this_account = g_new0 (struct account_token_count, 1);
    this_account->account_guid = g_strdup (guid_str);
    this_account->token_count = count;

    if ((otherTokenInfo = findAccountTokenCount (tokenInfo->accounts, this_account)))
    {
        /* This is a duplicate. Aggregate it and drop the temporary. */
        otherTokenInfo->token_count += this_account->token_count;
        g_free (this_account->account_guid);
        g_free (this_account);
    }
    else
    {
        /* the list takes ownership of this_account and its guid string */
        tokenInfo->accounts = g_list_prepend(tokenInfo->accounts, this_account);
    }
}

/** Intermediate values used to calculate the bayes probability of a given
 *  account, where p(AB) = (a*b)/[a*b + (1-a)(1-b)]:
 *  product is the running (a*b...), product_difference is the running
 *  (1-a) * (1-b) ...
 */
struct account_probability
{
    double product; /* product of probabilities */
    double product_difference; /* product of (1-probabilities) */
};

/** Scale used to store a probability as an integer: the stored value is
 *  100000x the probability, ie. 10% would be 0.10 * 100000 = 10000.
 */
#define PROBABILITY_FACTOR 100000

/** g_hash_table_foreach() callback: convert one entry of the table of
 *  (account key, struct account_probability*) into an entry of the table
 *  passed as @data, which maps the same key to the scaled integer
 *  probability.  The key pointer is shared between both tables.
 */
static void buildProbabilities(gpointer key, gpointer value, gpointer data)
{
    const struct account_probability *running = value;
    GHashTable *scaled_table = data;

    /* P(AB) = A*B / [A*B + (1-A)*(1-B)]
     * Only the running product (A*B*C...) and the running product
     * difference ((1-A)(1-B)...) are tracked, so combine them here.
     */
    double numerator = running->product;
    double denominator = running->product + running->product_difference;
    gint32 probability = (numerator / denominator) * PROBABILITY_FACTOR;

    PINFO("P('%s') = '%d'\n", (char*)key, probability);

    g_hash_table_insert(scaled_table, key, GINT_TO_POINTER(probability));
}

/** g_hash_table_foreach() callback that releases one entry of the running
 *  probabilities table: the struct account_probability allocated in
 *  gnc_imap_find_account_bayes() and the key string.  @data is unused.
 */
static void freeProbabilities(gpointer key, gpointer value, gpointer data)
{
    (void) data;

    g_free(value);
    g_free(key);
}

/** Holds an account key (GUID string or name) and its corresponding integer
 *  probability; the integer is the probability scaled by PROBABILITY_FACTOR.
 */
struct account_info
{
    char* account_guid; /* g_strdup() copy of the best key found so far, or NULL */
    gint32 probability; /* scaled probability belonging to account_guid */
};

/** g_hash_table_foreach() callback that tracks the entry with the highest
 *  probability.
 *
 *  @param key    pointer to the account key string
 *  @param value  the scaled probability (100000x), packed with
 *                GINT_TO_POINTER
 *  @param data   struct account_info* receiving the best entry seen so far;
 *                its account_guid is replaced by a fresh copy of @key
 */
static void highestProbability(gpointer key, gpointer value, gpointer data)
{
    struct account_info *best = data;
    gint32 candidate = GPOINTER_TO_INT(value);

    /* only a strictly greater probability replaces the stored one */
    if (candidate <= best->probability)
        return;

    best->probability = candidate;
    g_free (best->account_guid);
    best->account_guid = g_strdup (key);
}

/** Collect the per-account counts of @token from the flat bayes storage,
 *  where each count lives in a single slot whose key has the form
 *  IMAP_FRAME_BAYES "/" token "/" guid.
 *
 *  @return a newly allocated struct token_accounts_info (never NULL); the
 *          caller owns it, its list and the list elements.
 *
 *  NOTE(review): the prefix has no trailing '/', so presumably the keys of
 *  another token that merely starts with @token can match as well; confirm
 *  against kvp_frame_get_keys_matching_prefix().
 */
static struct token_accounts_info *
get_flat_account_tokens (char const * token, KvpFrame * imap_frame, QofBook * book)
{
    gchar * path_prefix = g_strdup_printf ("%s/%s", IMAP_FRAME_BAYES, token);
    GList * matching_keys = kvp_frame_get_keys_matching_prefix (imap_frame, path_prefix);
    GList const * iter;
    struct token_accounts_info * ret = g_new0 (struct token_accounts_info, 1);

    ret->book = book;

    for (iter = matching_keys; iter; iter = iter->next)
    {
        /* This needs to be const because the string actually
         * belongs to the string cache in KVP. */
        gchar const * temp_key = iter->data;
        /* The guid part of the key is just after the last '/'. */
        gchar const * last_slash = g_strrstr (temp_key, "/");
        gchar const * guid_str;
        gchar * temp_path;
        KvpValue const * temp_count_value;
        struct account_token_count * temp_token_count;

        /* every key matching the prefix should contain a '/'; skip
         * anything else rather than computing a pointer from NULL */
        if (!last_slash)
            continue;
        guid_str = last_slash + 1;

        temp_path = g_strdup_printf ("%s/%s", path_prefix, guid_str);
        temp_count_value = kvp_frame_get_slot_path (imap_frame, temp_path, NULL);

        /* No account lookup is done here: only the guid string and the
         * count are needed to build the entry. */
        temp_token_count = g_new0 (struct account_token_count, 1);
        temp_token_count->account_guid = g_strdup (guid_str);
        temp_token_count->token_count = kvp_value_get_gint64 (temp_count_value);

        ret->total_count += temp_token_count->token_count;
        ret->accounts = g_list_prepend (ret->accounts, temp_token_count);
        g_free (temp_path);
    }

    /* only the list cells are ours; the key strings belong to KVP */
    g_list_free (matching_keys);
    g_free (path_prefix);
    return ret;
}

/** Collect the per-account counts of @token from the nested bayes storage,
 *  where IMAP_FRAME_BAYES/<token> is a frame with one slot per account.
 *
 *  @return a newly allocated struct token_accounts_info owned by the
 *          caller, or NULL when the token has no slot or the slot does not
 *          hold a frame.
 */
static struct token_accounts_info *
get_not_flat_account_tokens (char const * token, KvpFrame * imap_frame, QofBook * book)
{
    KvpValue * value;
    KvpFrame * token_frame;
    struct token_accounts_info * tokenInfo; /**< holds the accounts and total
             * token count for a single token */

    PINFO("token: '%s'", token);

    /* find the slot for the given token off of the source account
     * for these tokens, search off of the IMAP_FRAME_BAYES path so
     * we aren't looking from the parent of the entire kvp tree
     */
    value = kvp_frame_get_slot_path(imap_frame, IMAP_FRAME_BAYES, token, NULL);

    /* if value is null we should skip over this token */
    if (!value)
        return NULL;

    /* convert the slot(value) into a the frame that contains the
     * list of accounts
     */
    token_frame = kvp_value_get_frame(value);

    /* token_frame should NEVER be null */
    if (!token_frame)
    {
        PERR("token '%s' has no accounts", token);
        return NULL;
    }

    /* Allocate only once the token is known to be usable, so the early
     * returns above have nothing to release.  g_new0 zeroes the structure.
     */
    tokenInfo = g_new0 (struct token_accounts_info, 1);
    tokenInfo->book = book;

    /* process the accounts for this token, adding the account if it
     * doesn't already exist or adding to the existing accounts token
     * count if it does
     */
    kvp_frame_for_each_slot(token_frame, buildTokenInfo, tokenInfo);
    return tokenInfo;
}

/** Fetch the per-account counts for @token, reading either the flat or the
 *  nested bayes storage depending on @flat_bayes.
 *
 *  @return whatever the selected reader returns: a caller-owned
 *          struct token_accounts_info, or NULL from the nested reader when
 *          the token is unknown.
 */
static struct token_accounts_info *
get_account_tokens (char const * token, KvpFrame * imap_frame, QofBook * book,
        gboolean flat_bayes)
{
    if (flat_bayes)
        return get_flat_account_tokens (token, imap_frame, book);
    return get_not_flat_account_tokens (token, imap_frame, book);
}

/* Minimum scaled probability (see PROBABILITY_FACTOR) an account must reach
 * for gnc_imap_find_account_bayes() to return it. */
#define threshold (.90 * PROBABILITY_FACTOR) /* 90% */

/** Look up an Account in the map using the bayes token counts.
 *
 *  For every token the per-account counts are read from the map and folded
 *  into a running product of probabilities per account.  The account with
 *  the highest resulting probability is returned if that probability
 *  reaches the threshold.
 *
 *  @param imap    the import map; NULL yields NULL
 *  @param tokens  list of token strings (char*)
 *  @return the best matching account, or NULL when no account reaches the
 *          threshold or the winning key cannot be resolved to an account
 */
Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList *tokens)
{
    GList *current_token;            /* current element of the input tokens */
    GList *current_account_token;    /* current struct account_token_count */
    GHashTable *running_probabilities;
    GHashTable *final_probabilities;
    struct account_info account_i = { NULL, 0 };
    Account *account = NULL;
    gboolean flat_bayes;

    ENTER(" ");

    /* check to see if the imap is NULL -- this has to happen before
     * anything reads imap->book */
    if (!imap)
    {
        PINFO("imap is null, returning null");
        LEAVE(" ");
        return NULL;
    }

    /* if guid flat bayes, all records should be in flat guid storage */
    flat_bayes = gnc_features_check_used (imap->book, GNC_FEATURE_GUID_FLAT_BAYESIAN);

    running_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
    final_probabilities = g_hash_table_new(g_str_hash, g_str_equal);

    /* find the probability for each account that contains any of the tokens
     * in the input tokens list
     */
    for (current_token = tokens; current_token; current_token = current_token->next)
    {
        struct token_accounts_info * account_tokens =
            get_account_tokens ((char const *) current_token->data,
                                imap->frame, imap->book, flat_bayes);

        if (!account_tokens)
            continue;

        /* for each account we have just found, see if the account already exists
         * in the list of account probabilities, if not add it
         */
        for (current_account_token = account_tokens->accounts; current_account_token;
                current_account_token = current_account_token->next)
        {
            /* the account key and corresponding token count */
            struct account_token_count *account_c = current_account_token->data;
            struct account_probability *account_p;
            /* share of this token's occurrences that belong to the account */
            double ratio = (double)account_c->token_count /
                           (double)account_tokens->total_count;

            PINFO("account_c->account_guid('%s'), "
                  "account_c->token_count('%ld')/total_count('%ld')",
                  account_c->account_guid, (long)account_c->token_count,
                  (long)account_tokens->total_count);

            account_p = g_hash_table_lookup(running_probabilities,
                                            account_c->account_guid);

            if (account_p)
            {
                /* the account is already known: continue the running
                 * probabilities */
                account_p->product = ratio * account_p->product;
                account_p->product_difference =
                    ((double)1 - ratio) * account_p->product_difference;
                PINFO("product == %f, product_difference == %f",
                      account_p->product, account_p->product_difference);

                /* the table already owns an equal key; drop this copy */
                g_free (account_c->account_guid);
            }
            else
            {
                /* add a new entry */
                PINFO("adding a new entry for this account");
                account_p = g_new0(struct account_probability, 1);

                /* set the product and product difference values */
                account_p->product = ratio;
                account_p->product_difference = (double)1 - ratio;

                PINFO("product == %f, product_difference == %f",
                      account_p->product, account_p->product_difference);

                /* the hash table takes ownership of the key string and of
                 * the struct account_probability */
                g_hash_table_insert(running_probabilities,
                                    (char *)account_c->account_guid, account_p);
            }
        } /* for all accounts in account_tokens */

        /* free the list elements; their guid strings were either freed or
         * handed to the hash table above */
        for (current_account_token = account_tokens->accounts; current_account_token;
                current_account_token = current_account_token->next)
        {
            g_free(current_account_token->data);
        }

        g_list_free(account_tokens->accounts);
        g_free(account_tokens);
    }

    /* build a hash table of account keys and their final probabilities
     * from each entry in the running_probabilities hash table
     */
    g_hash_table_foreach(running_probabilities, buildProbabilities,
                         final_probabilities);

    /* find the highest probability and the corresponding account; this
     * stores a fresh copy of the winning key in account_i.account_guid */
    g_hash_table_foreach(final_probabilities, highestProbability, &account_i);

    /* free each element of the running_probabilities hash (the keys are
     * shared with final_probabilities, so they are freed only once here) */
    g_hash_table_foreach(running_probabilities, freeProbabilities, NULL);

    /* free the hash tables */
    g_hash_table_destroy(running_probabilities);
    g_hash_table_destroy(final_probabilities);

    PINFO("highest P('%s') = '%d'",
          account_i.account_guid ? account_i.account_guid : "(null)",
          account_i.probability);

    /* has this probability met our threshold? */
    if (account_i.probability >= threshold)
    {
        PINFO("Probability has met threshold");

        account = gnc_account_lookup_by_full_name(gnc_book_get_root_account(imap->book),
                                                  account_i.account_guid);

        if (account == NULL) /* Possibly we have a Guid or account not found */
        {
            GncGUID guid;

            if (string_to_guid (account_i.account_guid, &guid))
                account = xaccAccountLookup (&guid, imap->book);
        }

        if (account != NULL)
            LEAVE("Return account is '%s'", xaccAccountGetName (account));
        else
            LEAVE("Return NULL, account for string '%s' can not be found", account_i.account_guid);
    }
    else
    {
        /* we didn't meet our threshold, return NULL for an account */
        PINFO("Probability has not met threshold");
        LEAVE("Return NULL");
    }

    /* release the copy of the winning key made by highestProbability() */
    g_free (account_i.account_guid);

    return account;
}

/** Updates the imap for a given account using a list of tokens.
 *
 *  For every non-empty token the count stored for (@token, @acc) is read,
 *  incremented by one and written back.  Depending on the features in use
 *  on the map's book the count is stored either in a flat slot keyed
 *  IMAP_FRAME_BAYES/<token>/<guid>, or under the nested path
 *  IMAP_FRAME_BAYES, <token>, <guid or account full name>.
 *
 *  @param imap    the import map; NULL makes this a no-op
 *  @param tokens  list of token strings (char*); NULL and empty entries
 *                 are skipped
 *  @param acc     the account the tokens are associated with; must not be
 *                 NULL
 */
void gnc_imap_add_account_bayes(GncImportMatchMap *imap, GList *tokens, Account *acc)
{
    GList *current_token;
    char *account_fullname;
    gchar guid_string [GUID_ENCODING_LENGTH + 1];
    gboolean guid_bayes;
    gboolean flat_bayes;

    ENTER(" ");

    /* if imap is null return -- this has to happen before anything reads
     * imap->book */
    if (!imap)
    {
        LEAVE(" ");
        return;
    }

    g_return_if_fail (acc != NULL);

    guid_bayes = gnc_features_check_used (imap->book, GNC_FEATURE_GUID_BAYESIAN);
    flat_bayes = gnc_features_check_used (imap->book, GNC_FEATURE_GUID_FLAT_BAYESIAN);

    account_fullname = gnc_account_get_full_name(acc);
    xaccAccountBeginEdit (imap->acc);

    PINFO("account name: '%s'\n", account_fullname);

    /* Render the GUID into a local buffer so the string stays valid for the
     * whole loop regardless of how long a guid_to_string() result lives. */
    guid_to_string_buff (xaccAccountGetGUID (acc), guid_string);

    /* process each token in the list */
    for (current_token = g_list_first(tokens); current_token;
            current_token = current_token->next)
    {
        const char *token = current_token->data;
        gchar const * name_or_guid = NULL; /* only used for nested storage */
        gchar * flat_path = NULL;          /* only used for flat storage */
        kvp_value *value;
        kvp_value *new_value; /* the value that will be added back into the kvp tree */
        gint64 token_count = 0; /* start off with no tokens for this account */

        /* Jump to next iteration if the pointer is not valid or if the
         * string is empty. In HBCI import we almost always get an empty
         * string, which doesn't work in the kvp lookup later. So we
         * skip this case here. */
        if (!token || (*token == '\0'))
            continue;

        PINFO("adding token '%s'\n", token);

        if (flat_bayes)
        {
            flat_path = g_strdup_printf ("%s/%s/%s", IMAP_FRAME_BAYES,
                    token, guid_string);
            value = kvp_frame_get_slot(imap->frame, flat_path);
        }
        else
        {
            name_or_guid = guid_bayes ? guid_string : account_fullname;
            value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
                                            token, name_or_guid,
                                            NULL);
        }

        if (value)
        {
            PINFO("found existing value of '%ld'\n",
                  (long)kvp_value_get_gint64(value));
            token_count += kvp_value_get_gint64(value);
        }

        /* count this occurrence */
        token_count++;
        new_value = kvp_value_new_gint64(token_count);

        if (flat_bayes)
        {
            kvp_frame_set_slot(imap->frame, flat_path, new_value);
            g_free (flat_path);
        }
        else
        {
            kvp_frame_set_slot_path(imap->frame, new_value, IMAP_FRAME_BAYES,
                                    token, name_or_guid, NULL);
        }

        /* the frame stored its own copy of the value, so we
         * need to delete this one ;-) */
        kvp_value_delete(new_value);
    }

    qof_instance_set_dirty (QOF_INSTANCE (imap->acc));
    xaccAccountCommitEdit (imap->acc);

    /* free up the account fullname string */
    g_free(account_fullname);

    LEAVE(" ");
}

/** @} */
