diff options
Diffstat (limited to 'comp2521/tf_idf/invertedIndex.h')
| -rw-r--r-- | comp2521/tf_idf/invertedIndex.h | 78 |
1 files changed, 78 insertions, 0 deletions
// COMP2521 Assignment 1
// DO NOT MODIFY THIS FILE

// Include guard. The name deliberately does not start with an underscore
// followed by an uppercase letter: such identifiers are reserved for the
// implementation.
#ifndef INVERTEDINDEX_GUARD
#define INVERTEDINDEX_GUARD

/**
 * One entry in the list of files attached to a word of the inverted index.
 */
struct FileListNode {
	char *filename;             // name of the file
	double tf;                  // relative tf
	struct FileListNode *next;  // next entry in the list
};
typedef struct FileListNode *FileList;

/**
 * One node of the inverted index tree, keyed by word.
 */
struct InvertedIndexNode {
	char *word;                       // key
	struct FileListNode *fileList;    // list of files attached to this word
	struct InvertedIndexNode *left;   // left child
	struct InvertedIndexNode *right;  // right child
};
typedef struct InvertedIndexNode *InvertedIndexBST;

/**
 * One entry in a list of tf-idf results.
 */
struct TfIdfNode {
	char *filename;          // name of the file
	double tfIdfSum;         // tfidf sum value
	struct TfIdfNode *next;  // next entry in the list
};
typedef struct TfIdfNode *TfIdfList;

// Functions for Part 1

/**
 * Normalises a given string. See the spec for details.
 *
 * Note: the given string itself is to be modified - do not create a copy
 * of it.
 *
 * @param str  the string to normalise
 */
char *normaliseWord(char *str);

/**
 * Opens the collection file with the given name and generates an inverted
 * index from the files listed in that collection file, as discussed in
 * the spec.
 *
 * @param collectionFilename  name of the collection file
 * @return the generated inverted index
 */
InvertedIndexBST generateInvertedIndex(char *collectionFilename);

/**
 * Outputs the given inverted index to a file named invertedIndex.txt.
 *
 * The output should contain one line per word, with the words ordered
 * alphabetically in ascending order. Each list of filenames for a word
 * should also be ordered alphabetically in ascending order.
 *
 * @param tree  the inverted index to output
 */
void printInvertedIndex(InvertedIndexBST tree);

// Functions for Part 2

/**
 * Returns an ordered list where each node contains a filename and the
 * corresponding tf-idf value for a given searchWord.
 *
 * Only documents (files) that contain the given searchWord need to be
 * included. The list must be in descending order of tf-idf value. If
 * there are multiple files with the same tf-idf, they are ordered by
 * filename in ascending order.
 *
 * @param tree        the inverted index to search
 * @param searchWord  the word to look up
 * @param D           the total number of documents in the collection
 */
TfIdfList calculateTfIdf(InvertedIndexBST tree, char *searchWord, int D);

/**
 * Returns an ordered list where each node contains a filename and the
 * summation of tf-idf values of all the matching search words for that
 * file.
 *
 * Only documents (files) that contain one or more of the given search
 * words need to be included. The list must be in descending order of
 * summation of tf-idf values (tfIdfSum). If there are multiple files with
 * the same tf-idf sum, they are ordered by filename in ascending order.
 *
 * NOTE(review): this header does not say how the end of searchWords is
 * marked - confirm against the spec.
 *
 * @param tree         the inverted index to search
 * @param searchWords  the words to look up
 * @param D            the total number of documents in the collection
 */
TfIdfList retrieve(InvertedIndexBST tree, char *searchWords[], int D);

#endif
