#include "invertedIndex.h"
#include "fileList.h"
#include "invertedIndexTree.h"
#include "io.h"

// Removes leading and trailing spaces, converts all characters to lowercase
// and removes the punctutation marks .,;? if they end the string.
char *normaliseWord(char *str) {
    char *front = str;
    for (int i = 0; str[i] != '\0'; ++i) {
        if (!isspace(str[i])) {
            front = &str[i];
            break;
        }
    }
    char *end = str;
    for (int i = 0; front[i] != '\0'; ++i) {
        front[i] = (char)tolower(front[i]);
        if (front[i + 1] == '\0') {
            end = &front[i + 1];
            switch (front[i]) {
            case '.':
            case ',':
            case ';':
            case '?':
                front[i] = '\0';
                --end;
            }
        }
    }
    return memmove(str, front, (unsigned long)(end - front + 1));
}

// Opens the collection file, generates an inverted index from those files
// listed in the collection file. Returns a generated inverted index.
struct InvertedIndexNode *generateInvertedIndex(char *collectionFilename) {
    FILE *f_fptr = fopen(collectionFilename, "r");
    if (!f_fptr) {
        fprintf(stderr,
                "fopen(%s, \"r\")\" in generateInvertedIndex returned"
                " NULL!\n",
                collectionFilename);
        return NULL;
    }
    // Read file for filenames, read those files for words, insert those words
    // in an inverted index. Duplicates words increment tf by one for that
    // file.
    struct InvertedIndexNode *index = createInvertedIndexNode(NULL);
    char *filename = NULL;
    while ((filename = getNextWord(f_fptr))) {
        FILE *w_fptr = fopen(filename, "r");
        if (!w_fptr) {
            fprintf(stderr,
                    "fopen(%s, \"r\") in generateInvertedIndex"
                    " returned NULL!\n",
                    filename);
            return NULL;
        }
        char *word = NULL;
        int word_count = 0;
        // Two loops are required to get the total amount of words required in
        // calculating tf.
        while ((word = getNextWord(w_fptr))) {
            ++word_count;
        }
        rewind(w_fptr);

        while ((word = getNextWord(w_fptr))) {
            word = normaliseWord(word);
            // Base case where index is empty, handle manually.
            if (index->word == NULL) {
                index->word = word;
            }
            insertInvertedIndexNode(index, word);
            struct InvertedIndexNode *i_node =
                searchInvertedIndexNode(index, word);
            struct FileListNode *f_node = insertFileList(i_node, filename);
            // There may be multiple words in the same file, so only write to
            // tf if the word does not exist again in the same file after that
            // point.
            if (!exists_after(w_fptr, word)) {
                f_node->tf /= word_count;
            }
        }
        fclose(w_fptr);
    }
    fclose(f_fptr);
    return index;
}

// Outputs the given inverted index to a file named invertedIndex.txt.
// One line per word, alphabetical, ascending order.
void printInvertedIndex(struct InvertedIndexNode *node) {
    if (!node) {
        return;
    }
    FILE *fptr = fopen("invertedIndex.txt", "w");
    if (fptr == NULL) {
        fprintf(stderr,
                "fopen(%s, \"r\") in printInvertedIndex"
                " returned NULL!\n",
                "invertedIndex.txt");
        return;
    }
    writeInvertedIndex(fptr, node);
    fclose(fptr);
}

// Returns  an  ordered list where each node contains a filename and the
// corresponding tf-idf value for a given searchWord. You only  need  to
// include documents (files) that contain the given searchWord. The list
// must  be  in  descending order of tf-idf value. If there are multiple
// files with same tf-idf, order them by  their  filename  in  ascending
// order. D is the total number of documents in the collection.
struct TfIdfNode *calculateTfIdf(struct InvertedIndexNode *tree,
                                 char *searchWord, int D) {
    struct InvertedIndexNode *i_node =
        searchInvertedIndexNode(tree, searchWord);
    struct FileListNode *f_node = i_node->fileList;
    int f_list_size = 0;
    for (struct FileListNode *i = f_node; i; i = i->next) {
        ++f_list_size;
    }
    double idf = log10((double)D / (double)f_list_size);
    struct FileListNode *ret = NULL;
    // tfidf is idf * tf
    for (struct FileListNode *i = f_node; i; i = i->next) {
        ret = insertFileOrderedList(ret, i, idf);
    }
    // Fortunately, the contents of the two structs are the same. We can just
    // cast the pointer to the correct return type, and return that.
    return (struct TfIdfNode *)ret;
}

// Returns  an  ordered list where each node contains a filename and the
// summation of tf-idf values of all the matching search words for  that
// file.  You only need to include documents (files) that contain one or
// more of the given search words. The list must be in descending  order
// of summation of tf-idf values (tfIdfSum). If there are multiple files
// with  the  same tf-idf sum, order them by their filename in ascending
// order. D is the total number of documents in the collection.
struct TfIdfNode *retrieve(InvertedIndexBST tree, char *searchWords[], int D) {
    struct FileListNode *ret = NULL;
    for (char **i = searchWords; *i; ++i) {
        struct FileListNode *node =
            (struct FileListNode *)calculateTfIdf(tree, *i, D);

        for (struct FileListNode *f_i = node; f_i; f_i = f_i->next) {
            ret = insertFileOrderedList(ret, f_i, 1.0);
        }
    }
    return (struct TfIdfNode *)ret;
}