#include "invertedIndex.h" #include "fileList.h" #include "invertedIndexTree.h" #include "io.h" // Removes leading and trailing spaces, converts all characters to lowercase // and removes the punctutation marks .,;? if they end the string. char *normaliseWord(char *str) { char *front = str; for (int i = 0; str[i] != '\0'; ++i) { if (!isspace(str[i])) { front = &str[i]; break; } } char *end = str; for (int i = 0; front[i] != '\0'; ++i) { front[i] = (char)tolower(front[i]); if (front[i + 1] == '\0') { end = &front[i + 1]; switch (front[i]) { case '.': case ',': case ';': case '?': front[i] = '\0'; --end; } } } return memmove(str, front, (unsigned long)(end - front + 1)); } // Opens the collection file, generates an inverted index from those files // listed in the collection file. Returns a generated inverted index. struct InvertedIndexNode *generateInvertedIndex(char *collectionFilename) { FILE *f_fptr = fopen(collectionFilename, "r"); if (!f_fptr) { fprintf(stderr, "fopen(%s, \"r\")\" in generateInvertedIndex returned" " NULL!\n", collectionFilename); return NULL; } // Read file for filenames, read those files for words, insert those words // in an inverted index. Duplicates words increment tf by one for that // file. struct InvertedIndexNode *index = createInvertedIndexNode(NULL); char *filename = NULL; while ((filename = getNextWord(f_fptr))) { FILE *w_fptr = fopen(filename, "r"); if (!w_fptr) { fprintf(stderr, "fopen(%s, \"r\") in generateInvertedIndex" " returned NULL!\n", filename); return NULL; } char *word = NULL; int word_count = 0; // Two loops are required to get the total amount of words required in // calculating tf. while ((word = getNextWord(w_fptr))) { ++word_count; } rewind(w_fptr); while ((word = getNextWord(w_fptr))) { word = normaliseWord(word); // Base case where index is empty, handle manually. if (index->word == NULL) { index->word = word; } insertInvertedIndexNode(index, word); struct InvertedIndexNode *i_node = searchInvertedIndexNode(index, word); struct FileListNode *f_node = insertFileList(i_node, filename); // There may be multiple words in the same file, so only write to // tf if the word does not exist again in the same file after that // point. if (!exists_after(w_fptr, word)) { f_node->tf /= word_count; } } fclose(w_fptr); } fclose(f_fptr); return index; } // Outputs the given inverted index to a file named invertedIndex.txt. // One line per word, alphabetical, ascending order. void printInvertedIndex(struct InvertedIndexNode *node) { if (!node) { return; } FILE *fptr = fopen("invertedIndex.txt", "w"); if (fptr == NULL) { fprintf(stderr, "fopen(%s, \"r\") in printInvertedIndex" " returned NULL!\n", "invertedIndex.txt"); return; } writeInvertedIndex(fptr, node); fclose(fptr); } // Returns an ordered list where each node contains a filename and the // corresponding tf-idf value for a given searchWord. You only need to // include documents (files) that contain the given searchWord. The list // must be in descending order of tf-idf value. If there are multiple // files with same tf-idf, order them by their filename in ascending // order. D is the total number of documents in the collection. struct TfIdfNode *calculateTfIdf(struct InvertedIndexNode *tree, char *searchWord, int D) { struct InvertedIndexNode *i_node = searchInvertedIndexNode(tree, searchWord); struct FileListNode *f_node = i_node->fileList; int f_list_size = 0; for (struct FileListNode *i = f_node; i; i = i->next) { ++f_list_size; } double idf = log10((double)D / (double)f_list_size); struct FileListNode *ret = NULL; // tfidf is idf * tf for (struct FileListNode *i = f_node; i; i = i->next) { ret = insertFileOrderedList(ret, i, idf); } // Fortunately, the contents of the two structs are the same. We can just // cast the pointer to the correct return type, and return that. return (struct TfIdfNode *)ret; } // Returns an ordered list where each node contains a filename and the // summation of tf-idf values of all the matching search words for that // file. You only need to include documents (files) that contain one or // more of the given search words. The list must be in descending order // of summation of tf-idf values (tfIdfSum). If there are multiple files // with the same tf-idf sum, order them by their filename in ascending // order. D is the total number of documents in the collection. struct TfIdfNode *retrieve(InvertedIndexBST tree, char *searchWords[], int D) { struct FileListNode *ret = NULL; for (char **i = searchWords; *i; ++i) { struct FileListNode *node = (struct FileListNode *)calculateTfIdf(tree, *i, D); for (struct FileListNode *f_i = node; f_i; f_i = f_i->next) { ret = insertFileOrderedList(ret, f_i, 1.0); } } return (struct TfIdfNode *)ret; }