1 files changed, 78 insertions, 0 deletions
diff --git a/comp2521/tf_idf/invertedIndex.h b/comp2521/tf_idf/invertedIndex.h
new file mode 100644
index 0000000..ddf9932
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndex.h
@@ -0,0 +1,78 @@
+// COMP2521 Assignment 1
+// DO NOT MODIFY THIS FILE
+
+#ifndef INVERTEDINDEX_GUARD // include guard; no leading '_' + uppercase, which ISO C reserves
+#define INVERTEDINDEX_GUARD
+
+struct FileListNode { // one entry in a singly linked list of files
+	char *filename; // name of the file this entry refers to
+	double tf; // relative tf (presumably term frequency - see the spec for its definition)
+	struct FileListNode *next; // following entry in the list
+};
+typedef struct FileListNode *FileList; // a FileList is a pointer to a list node
+
+struct InvertedIndexNode { // one node of the inverted index tree
+	char *word; // key
+	struct FileListNode *fileList; // list of file entries attached to this word
+	struct InvertedIndexNode *left; // left child
+	struct InvertedIndexNode *right; // right child
+
+};
+typedef struct InvertedIndexNode *InvertedIndexBST; // an InvertedIndexBST is a pointer to a tree node
+
+struct TfIdfNode { // one entry in a singly linked list of per-file tf-idf results
+	char *filename; // file this result refers to
+	double tfIdfSum; // tfidf sum value
+	struct TfIdfNode *next; // following entry in the list
+};
+typedef struct TfIdfNode *TfIdfList; // a TfIdfList is a pointer to a list node
+
+// Functions for Part 1
+
+/**
+ * Normalises the given string in place (see the spec for the rules); do
+ * not create a copy. Presumably returns str itself - confirm in the spec.
+ */
+char *normaliseWord(char *str);
+
+/**
+ * Opens the collection file with the given name and generates an
+ * inverted index from the files listed in that collection file, as
+ * discussed in the spec. Returns the generated inverted index (an
+ * InvertedIndexBST, i.e. a pointer to a struct InvertedIndexNode).
+ */
+InvertedIndexBST generateInvertedIndex(char *collectionFilename);
+
+/**
+ * Outputs the given inverted index to a file named invertedIndex.txt.
+ * The output should contain one line per word, with the words ordered
+ * alphabetically in ascending order. The list of filenames for each
+ * word should also be ordered alphabetically in ascending order.
+ */
+void printInvertedIndex(InvertedIndexBST tree); 
+
+// Functions for Part 2
+
+/**
+ * Returns an ordered list in which each node holds a filename and the
+ * corresponding tf-idf value for the given searchWord. Only documents
+ * (files) that contain searchWord need to be included. The list must be
+ * in descending order of tf-idf value; files with the same tf-idf are
+ * ordered by filename in ascending order. D is the total number of
+ * documents in the collection.
+ */
+TfIdfList calculateTfIdf(InvertedIndexBST tree, char *searchWord, int D);
+
+/**
+ * Returns an ordered list in which each node holds a filename and the
+ * sum of the tf-idf values of all matching search words for that file.
+ * Only documents (files) that contain one or more of the given search
+ * words need to be included. The list must be in descending order of
+ * the summed tf-idf value (tfIdfSum); files with the same sum are
+ * ordered by filename in ascending order. D is the total number of
+ * documents in the collection.
+ */
+TfIdfList retrieve(InvertedIndexBST tree, char *searchWords[], int D);
+
+#endif
+