diff options
Diffstat (limited to 'comp2521/tf_idf/invertedIndex.c')
| -rw-r--r-- | comp2521/tf_idf/invertedIndex.c | 153 |
1 files changed, 153 insertions, 0 deletions
diff --git a/comp2521/tf_idf/invertedIndex.c b/comp2521/tf_idf/invertedIndex.c new file mode 100644 index 0000000..e3c21a1 --- /dev/null +++ b/comp2521/tf_idf/invertedIndex.c @@ -0,0 +1,153 @@ +#include "invertedIndex.h" +#include "fileList.h" +#include "invertedIndexTree.h" +#include "io.h" + +// Removes leading and trailing spaces, converts all characters to lowercase +// and removes the punctutation marks .,;? if they end the string. +char *normaliseWord(char *str) { + char *front = str; + for (int i = 0; str[i] != '\0'; ++i) { + if (!isspace(str[i])) { + front = &str[i]; + break; + } + } + char *end = str; + for (int i = 0; front[i] != '\0'; ++i) { + front[i] = (char)tolower(front[i]); + if (front[i + 1] == '\0') { + end = &front[i + 1]; + switch (front[i]) { + case '.': + case ',': + case ';': + case '?': + front[i] = '\0'; + --end; + } + } + } + return memmove(str, front, (unsigned long)(end - front + 1)); +} + +// Opens the collection file, generates an inverted index from those files +// listed in the collection file. Returns a generated inverted index. +struct InvertedIndexNode *generateInvertedIndex(char *collectionFilename) { + FILE *f_fptr = fopen(collectionFilename, "r"); + if (!f_fptr) { + fprintf(stderr, + "fopen(%s, \"r\")\" in generateInvertedIndex returned" + " NULL!\n", + collectionFilename); + return NULL; + } + // Read file for filenames, read those files for words, insert those words + // in an inverted index. Duplicates words increment tf by one for that + // file. + struct InvertedIndexNode *index = createInvertedIndexNode(NULL); + char *filename = NULL; + while ((filename = getNextWord(f_fptr))) { + FILE *w_fptr = fopen(filename, "r"); + if (!w_fptr) { + fprintf(stderr, + "fopen(%s, \"r\") in generateInvertedIndex" + " returned NULL!\n", + filename); + return NULL; + } + char *word = NULL; + int word_count = 0; + // Two loops are required to get the total amount of words required in + // calculating tf. + while ((word = getNextWord(w_fptr))) { + ++word_count; + } + rewind(w_fptr); + + while ((word = getNextWord(w_fptr))) { + word = normaliseWord(word); + // Base case where index is empty, handle manually. + if (index->word == NULL) { + index->word = word; + } + insertInvertedIndexNode(index, word); + struct InvertedIndexNode *i_node = + searchInvertedIndexNode(index, word); + struct FileListNode *f_node = insertFileList(i_node, filename); + // There may be multiple words in the same file, so only write to + // tf if the word does not exist again in the same file after that + // point. + if (!exists_after(w_fptr, word)) { + f_node->tf /= word_count; + } + } + fclose(w_fptr); + } + fclose(f_fptr); + return index; +} + +// Outputs the given inverted index to a file named invertedIndex.txt. +// One line per word, alphabetical, ascending order. +void printInvertedIndex(struct InvertedIndexNode *node) { + if (!node) { + return; + } + FILE *fptr = fopen("invertedIndex.txt", "w"); + if (fptr == NULL) { + fprintf(stderr, + "fopen(%s, \"r\") in printInvertedIndex" + " returned NULL!\n", + "invertedIndex.txt"); + return; + } + writeInvertedIndex(fptr, node); + fclose(fptr); +} + +// Returns an ordered list where each node contains a filename and the +// corresponding tf-idf value for a given searchWord. You only need to +// include documents (files) that contain the given searchWord. The list +// must be in descending order of tf-idf value. If there are multiple +// files with same tf-idf, order them by their filename in ascending +// order. D is the total number of documents in the collection. +struct TfIdfNode *calculateTfIdf(struct InvertedIndexNode *tree, + char *searchWord, int D) { + struct InvertedIndexNode *i_node = + searchInvertedIndexNode(tree, searchWord); + struct FileListNode *f_node = i_node->fileList; + int f_list_size = 0; + for (struct FileListNode *i = f_node; i; i = i->next) { + ++f_list_size; + } + double idf = log10((double)D / (double)f_list_size); + struct FileListNode *ret = NULL; + // tfidf is idf * tf + for (struct FileListNode *i = f_node; i; i = i->next) { + ret = insertFileOrderedList(ret, i, idf); + } + // Fortunately, the contents of the two structs are the same. We can just + // cast the pointer to the correct return type, and return that. + return (struct TfIdfNode *)ret; +} + +// Returns an ordered list where each node contains a filename and the +// summation of tf-idf values of all the matching search words for that +// file. You only need to include documents (files) that contain one or +// more of the given search words. The list must be in descending order +// of summation of tf-idf values (tfIdfSum). If there are multiple files +// with the same tf-idf sum, order them by their filename in ascending +// order. D is the total number of documents in the collection. +struct TfIdfNode *retrieve(InvertedIndexBST tree, char *searchWords[], int D) { + struct FileListNode *ret = NULL; + for (char **i = searchWords; *i; ++i) { + struct FileListNode *node = + (struct FileListNode *)calculateTfIdf(tree, *i, D); + + for (struct FileListNode *f_i = node; f_i; f_i = f_i->next) { + ret = insertFileOrderedList(ret, f_i, 1.0); + } + } + return (struct TfIdfNode *)ret; +} |
