aboutsummaryrefslogtreecommitdiff
path: root/comp2521/tf_idf/invertedIndex.c
diff options
context:
space:
mode:
Diffstat (limited to 'comp2521/tf_idf/invertedIndex.c')
-rw-r--r--comp2521/tf_idf/invertedIndex.c153
1 files changed, 153 insertions, 0 deletions
diff --git a/comp2521/tf_idf/invertedIndex.c b/comp2521/tf_idf/invertedIndex.c
new file mode 100644
index 0000000..e3c21a1
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndex.c
@@ -0,0 +1,153 @@
+#include "invertedIndex.h"
+#include "fileList.h"
+#include "invertedIndexTree.h"
+#include "io.h"
+
+// Removes leading and trailing spaces, converts all characters to lowercase
+// and removes the punctutation marks .,;? if they end the string.
+char *normaliseWord(char *str) {
+ char *front = str;
+ for (int i = 0; str[i] != '\0'; ++i) {
+ if (!isspace(str[i])) {
+ front = &str[i];
+ break;
+ }
+ }
+ char *end = str;
+ for (int i = 0; front[i] != '\0'; ++i) {
+ front[i] = (char)tolower(front[i]);
+ if (front[i + 1] == '\0') {
+ end = &front[i + 1];
+ switch (front[i]) {
+ case '.':
+ case ',':
+ case ';':
+ case '?':
+ front[i] = '\0';
+ --end;
+ }
+ }
+ }
+ return memmove(str, front, (unsigned long)(end - front + 1));
+}
+
+// Opens the collection file, generates an inverted index from those files
+// listed in the collection file. Returns a generated inverted index.
+struct InvertedIndexNode *generateInvertedIndex(char *collectionFilename) {
+ FILE *f_fptr = fopen(collectionFilename, "r");
+ if (!f_fptr) {
+ fprintf(stderr,
+ "fopen(%s, \"r\")\" in generateInvertedIndex returned"
+ " NULL!\n",
+ collectionFilename);
+ return NULL;
+ }
+ // Read file for filenames, read those files for words, insert those words
+ // in an inverted index. Duplicates words increment tf by one for that
+ // file.
+ struct InvertedIndexNode *index = createInvertedIndexNode(NULL);
+ char *filename = NULL;
+ while ((filename = getNextWord(f_fptr))) {
+ FILE *w_fptr = fopen(filename, "r");
+ if (!w_fptr) {
+ fprintf(stderr,
+ "fopen(%s, \"r\") in generateInvertedIndex"
+ " returned NULL!\n",
+ filename);
+ return NULL;
+ }
+ char *word = NULL;
+ int word_count = 0;
+ // Two loops are required to get the total amount of words required in
+ // calculating tf.
+ while ((word = getNextWord(w_fptr))) {
+ ++word_count;
+ }
+ rewind(w_fptr);
+
+ while ((word = getNextWord(w_fptr))) {
+ word = normaliseWord(word);
+ // Base case where index is empty, handle manually.
+ if (index->word == NULL) {
+ index->word = word;
+ }
+ insertInvertedIndexNode(index, word);
+ struct InvertedIndexNode *i_node =
+ searchInvertedIndexNode(index, word);
+ struct FileListNode *f_node = insertFileList(i_node, filename);
+ // There may be multiple words in the same file, so only write to
+ // tf if the word does not exist again in the same file after that
+ // point.
+ if (!exists_after(w_fptr, word)) {
+ f_node->tf /= word_count;
+ }
+ }
+ fclose(w_fptr);
+ }
+ fclose(f_fptr);
+ return index;
+}
+
+// Outputs the given inverted index to a file named invertedIndex.txt.
+// One line per word, alphabetical, ascending order.
+void printInvertedIndex(struct InvertedIndexNode *node) {
+ if (!node) {
+ return;
+ }
+ FILE *fptr = fopen("invertedIndex.txt", "w");
+ if (fptr == NULL) {
+ fprintf(stderr,
+ "fopen(%s, \"r\") in printInvertedIndex"
+ " returned NULL!\n",
+ "invertedIndex.txt");
+ return;
+ }
+ writeInvertedIndex(fptr, node);
+ fclose(fptr);
+}
+
+// Returns an ordered list where each node contains a filename and the
+// corresponding tf-idf value for a given searchWord. You only need to
+// include documents (files) that contain the given searchWord. The list
+// must be in descending order of tf-idf value. If there are multiple
+// files with same tf-idf, order them by their filename in ascending
+// order. D is the total number of documents in the collection.
+struct TfIdfNode *calculateTfIdf(struct InvertedIndexNode *tree,
+ char *searchWord, int D) {
+ struct InvertedIndexNode *i_node =
+ searchInvertedIndexNode(tree, searchWord);
+ struct FileListNode *f_node = i_node->fileList;
+ int f_list_size = 0;
+ for (struct FileListNode *i = f_node; i; i = i->next) {
+ ++f_list_size;
+ }
+ double idf = log10((double)D / (double)f_list_size);
+ struct FileListNode *ret = NULL;
+ // tfidf is idf * tf
+ for (struct FileListNode *i = f_node; i; i = i->next) {
+ ret = insertFileOrderedList(ret, i, idf);
+ }
+ // Fortunately, the contents of the two structs are the same. We can just
+ // cast the pointer to the correct return type, and return that.
+ return (struct TfIdfNode *)ret;
+}
+
+// Returns an ordered list where each node contains a filename and the
+// summation of tf-idf values of all the matching search words for that
+// file. You only need to include documents (files) that contain one or
+// more of the given search words. The list must be in descending order
+// of summation of tf-idf values (tfIdfSum). If there are multiple files
+// with the same tf-idf sum, order them by their filename in ascending
+// order. D is the total number of documents in the collection.
+struct TfIdfNode *retrieve(InvertedIndexBST tree, char *searchWords[], int D) {
+ struct FileListNode *ret = NULL;
+ for (char **i = searchWords; *i; ++i) {
+ struct FileListNode *node =
+ (struct FileListNode *)calculateTfIdf(tree, *i, D);
+
+ for (struct FileListNode *f_i = node; f_i; f_i = f_i->next) {
+ ret = insertFileOrderedList(ret, f_i, 1.0);
+ }
+ }
+ return (struct TfIdfNode *)ret;
+}