blob: ddf9932a32917694230dc3fa436c236ad1c7837a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
|
// COMP2521 Assignment 1
// DO NOT MODIFY THIS FILE
#ifndef _INVERTEDINDEX_GUARD
#define _INVERTEDINDEX_GUARD
/**
 * One node of a singly linked list of files.
 *
 * FileList is the pointer-to-node handle used to refer to such a list.
 */
typedef struct FileListNode {
    char *filename;            /**< name of the file */
    double tf;                 /**< relative tf value recorded for the file */
    struct FileListNode *next; /**< following node in the list */
} *FileList;
/**
 * One node of the inverted index, a binary tree keyed on a word.
 *
 * InvertedIndexBST is the pointer-to-node handle used to refer to a
 * (sub)tree.
 */
typedef struct InvertedIndexNode {
    char *word;                      /**< key of this node */
    struct FileListNode *fileList;   /**< list of files attached to the word */
    struct InvertedIndexNode *left;  /**< left child */
    struct InvertedIndexNode *right; /**< right child */
} *InvertedIndexBST;
/**
 * One node of a singly linked list pairing a file with a tf-idf total.
 *
 * TfIdfList is the pointer-to-node handle used to refer to such a list.
 */
typedef struct TfIdfNode {
    char *filename;         /**< name of the file */
    double tfIdfSum;        /**< tf-idf sum value for the file */
    struct TfIdfNode *next; /**< following node in the list */
} *TfIdfList;
// Functions for Part 1
/**
 * Normalises the given string in place. The normalisation rules are
 * defined in the assignment spec.
 *
 * The string passed in must be modified directly; implementations
 * should not create a copy of it.
 *
 * @param str  the string to normalise (modified by this call)
 * @return     not specified in this header; presumably the same pointer
 *             as str - confirm against the spec
 */
char *normaliseWord(char *str);
/**
 * Opens the collection file with the given name and generates an
 * inverted index from the files listed in that collection file, as
 * discussed in the spec.
 *
 * @param collectionFilename  name of the collection file to open
 * @return                    the generated inverted index
 */
InvertedIndexBST generateInvertedIndex(char *collectionFilename);
/**
 * Outputs the given inverted index to a file named invertedIndex.txt.
 *
 * The output contains one line per word, with the words in ascending
 * alphabetical order. The list of filenames for each word is also in
 * ascending alphabetical order.
 *
 * @param tree  the inverted index to output
 */
void printInvertedIndex(InvertedIndexBST tree);
// Functions for Part 2
/**
 * Returns an ordered list in which each node holds a filename and the
 * corresponding tf-idf value for the given searchWord. Only documents
 * (files) that contain searchWord need to be included.
 *
 * Ordering: descending tf-idf value; files with the same tf-idf are
 * ordered by filename, ascending.
 *
 * @param tree        the inverted index to search
 * @param searchWord  the word to compute tf-idf values for
 * @param D           total number of documents in the collection
 * @return            the ordered list described above
 */
TfIdfList calculateTfIdf(InvertedIndexBST tree, char *searchWord, int D);
/**
 * Returns an ordered list in which each node holds a filename and the
 * summation of the tf-idf values of all matching search words for that
 * file. Only documents (files) that contain one or more of the given
 * search words need to be included.
 *
 * Ordering: descending tf-idf sum (tfIdfSum); files with the same
 * tf-idf sum are ordered by filename, ascending.
 *
 * NOTE(review): this header does not say how the end of searchWords is
 * marked (presumably a NULL entry) - confirm against the spec.
 *
 * @param tree         the inverted index to search
 * @param searchWords  array of words to search for
 * @param D            total number of documents in the collection
 * @return             the ordered list described above
 */
TfIdfList retrieve(InvertedIndexBST tree, char *searchWords[], int D);
#endif
|