aboutsummaryrefslogtreecommitdiff
path: root/comp2521/tf_idf/invertedIndex.h
diff options
context:
space:
mode:
Diffstat (limited to 'comp2521/tf_idf/invertedIndex.h')
-rw-r--r--comp2521/tf_idf/invertedIndex.h78
1 files changed, 78 insertions, 0 deletions
diff --git a/comp2521/tf_idf/invertedIndex.h b/comp2521/tf_idf/invertedIndex.h
new file mode 100644
index 0000000..ddf9932
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndex.h
@@ -0,0 +1,78 @@
+// COMP2521 Assignment 1
+// DO NOT MODIFY THIS FILE
+
+#ifndef _INVERTEDINDEX_GUARD
+#define _INVERTEDINDEX_GUARD
+
+struct FileListNode {
+ char *filename;
+ double tf; // relative tf
+ struct FileListNode *next;
+};
+typedef struct FileListNode *FileList;
+
+struct InvertedIndexNode {
+ char *word; // key
+ struct FileListNode *fileList;
+ struct InvertedIndexNode *left;
+ struct InvertedIndexNode *right;
+
+};
+typedef struct InvertedIndexNode *InvertedIndexBST;
+
+struct TfIdfNode {
+ char *filename;
+ double tfIdfSum; // tfidf sum value
+ struct TfIdfNode *next;
+};
+typedef struct TfIdfNode *TfIdfList;
+
+// Functions for Part 1
+
+/**
+ * Normalises a given string. See the spec for details. Note: you should
+ * modify the given string - do not create a copy of it.
+ */
+char *normaliseWord(char *str);
+
+/**
+ * This function opens the collection file with the given name, and then
+ * generates an inverted index from those files listed in the collection
+ * file, as discussed in the spec. It returns the generated inverted
+ * index.
+ */
+InvertedIndexBST generateInvertedIndex(char *collectionFilename);
+
+/**
+ * Outputs the given inverted index to a file named invertedIndex.txt.
+ * The output should contain one line per word, with the words ordered
+ * alphabetically in ascending order. Each list of filenames for a word
+ * should be ordered alphabetically in ascending order.
+*/
+void printInvertedIndex(InvertedIndexBST tree);
+
+// Functions for Part-2
+
+/**
+ * Returns an ordered list where each node contains a filename and the
+ * corresponding tf-idf value for a given searchWord. You only need to
+ * include documents (files) that contain the given searchWord. The list
+ * must be in descending order of tf-idf value. If there are multiple
+ * files with same tf-idf, order them by their filename in ascending
+ * order. D is the total number of documents in the collection.
+ */
+TfIdfList calculateTfIdf(InvertedIndexBST tree, char *searchWord, int D);
+
+/**
+ * Returns an ordered list where each node contains a filename and the
+ * summation of tf-idf values of all the matching search words for that
+ * file. You only need to include documents (files) that contain one or
+ * more of the given search words. The list must be in descending order
+ * of summation of tf-idf values (tfIdfSum). If there are multiple files
+ * with the same tf-idf sum, order them by their filename in ascending
+ * order. D is the total number of documents in the collection.
+ */
+TfIdfList retrieve(InvertedIndexBST tree, char *searchWords[], int D);
+
+#endif
+