comp2521/tf_idf/invertedIndex.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

#include "invertedIndex.h"
#include "fileList.h"
#include "invertedIndexTree.h"
#include "io.h"

// Removes leading and trailing spaces, converts all characters to lowercase
// and removes the punctutation marks .,;? if they end the string.
char *normaliseWord(char *str) {
    char *front = str;
    for (int i = 0; str[i] != '\0'; ++i) {
        if (!isspace(str[i])) {
            front = &str[i];
            break;
        }
    }
    char *end = str;
    for (int i = 0; front[i] != '\0'; ++i) {
        front[i] = (char)tolower(front[i]);
        if (front[i + 1] == '\0') {
            end = &front[i + 1];
            switch (front[i]) {
            case '.':
            case ',':
            case ';':
            case '?':
                front[i] = '\0';
                --end;
            }
        }
    }
    return memmove(str, front, (unsigned long)(end - front + 1));
}

// Opens the collection file, generates an inverted index from those files
// listed in the collection file. Returns a generated inverted index.
struct InvertedIndexNode *generateInvertedIndex(char *collectionFilename) {
    FILE *f_fptr = fopen(collectionFilename, "r");
    if (!f_fptr) {
        fprintf(stderr,
                "fopen(%s, \"r\")\" in generateInvertedIndex returned"
                " NULL!\n",
                collectionFilename);
        return NULL;
    }
    // Read file for filenames, read those files for words, insert those words
    // in an inverted index. Duplicates words increment tf by one for that
    // file.
    struct InvertedIndexNode *index = createInvertedIndexNode(NULL);
    char *filename = NULL;
    while ((filename = getNextWord(f_fptr))) {
        FILE *w_fptr = fopen(filename, "r");
        if (!w_fptr) {
            fprintf(stderr,
                    "fopen(%s, \"r\") in generateInvertedIndex"
                    " returned NULL!\n",
                    filename);
            return NULL;
        }
        char *word = NULL;
        int word_count = 0;
        // Two loops are required to get the total amount of words required in
        // calculating tf.
        while ((word = getNextWord(w_fptr))) {
            ++word_count;
        }
        rewind(w_fptr);

        while ((word = getNextWord(w_fptr))) {
            word = normaliseWord(word);
            // Base case where index is empty, handle manually.
            if (index->word == NULL) {
                index->word = word;
            }
            insertInvertedIndexNode(index, word);
            struct InvertedIndexNode *i_node =
                searchInvertedIndexNode(index, word);
            struct FileListNode *f_node = insertFileList(i_node, filename);
            // There may be multiple words in the same file, so only write to
            // tf if the word does not exist again in the same file after that
            // point.
            if (!exists_after(w_fptr, word)) {
                f_node->tf /= word_count;
            }
        }
        fclose(w_fptr);
    }
    fclose(f_fptr);
    return index;
}

// Outputs the given inverted index to a file named invertedIndex.txt.
// One line per word, alphabetical, ascending order.
void printInvertedIndex(struct InvertedIndexNode *node) {
    if (!node) {
        return;
    }
    FILE *fptr = fopen("invertedIndex.txt", "w");
    if (fptr == NULL) {
        fprintf(stderr,
                "fopen(%s, \"r\") in printInvertedIndex"
                " returned NULL!\n",
                "invertedIndex.txt");
        return;
    }
    writeInvertedIndex(fptr, node);
    fclose(fptr);
}

// Returns  an  ordered list where each node contains a filename and the
// corresponding tf-idf value for a given searchWord. You only  need  to
// include documents (files) that contain the given searchWord. The list
// must  be  in  descending order of tf-idf value. If there are multiple
// files with same tf-idf, order them by  their  filename  in  ascending
// order. D is the total number of documents in the collection.
struct TfIdfNode *calculateTfIdf(struct InvertedIndexNode *tree,
                                 char *searchWord, int D) {
    struct InvertedIndexNode *i_node =
        searchInvertedIndexNode(tree, searchWord);
    struct FileListNode *f_node = i_node->fileList;
    int f_list_size = 0;
    for (struct FileListNode *i = f_node; i; i = i->next) {
        ++f_list_size;
    }
    double idf = log10((double)D / (double)f_list_size);
    struct FileListNode *ret = NULL;
    // tfidf is idf * tf
    for (struct FileListNode *i = f_node; i; i = i->next) {
        ret = insertFileOrderedList(ret, i, idf);
    }
    // Fortunately, the contents of the two structs are the same. We can just
    // cast the pointer to the correct return type, and return that.
    return (struct TfIdfNode *)ret;
}

// Returns  an  ordered list where each node contains a filename and the
// summation of tf-idf values of all the matching search words for  that
// file.  You only need to include documents (files) that contain one or
// more of the given search words. The list must be in descending  order
// of summation of tf-idf values (tfIdfSum). If there are multiple files
// with  the  same tf-idf sum, order them by their filename in ascending
// order. D is the total number of documents in the collection.
struct TfIdfNode *retrieve(InvertedIndexBST tree, char *searchWords[], int D) {
    struct FileListNode *ret = NULL;
    for (char **i = searchWords; *i; ++i) {
        struct FileListNode *node =
            (struct FileListNode *)calculateTfIdf(tree, *i, D);

        for (struct FileListNode *f_i = node; f_i; f_i = f_i->next) {
            ret = insertFileOrderedList(ret, f_i, 1.0);
        }
    }
    return (struct TfIdfNode *)ret;
}