diff options
Diffstat (limited to 'comp2521/tf_idf/fileList.c')
| -rw-r--r-- | comp2521/tf_idf/fileList.c | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/comp2521/tf_idf/fileList.c b/comp2521/tf_idf/fileList.c new file mode 100644 index 0000000..53dc0f9 --- /dev/null +++ b/comp2521/tf_idf/fileList.c @@ -0,0 +1,118 @@ +#include "fileList.h" + +// Returns a newly malloc'ed file list node, with word initialised to word +// and tf set to 1. +struct FileListNode *createFileListNode(char *const word) { + struct FileListNode *ret = calloc(1, sizeof(struct FileListNode)); + ret->filename = word; + ret->tf = 1; + return ret; +} + +// Inserts a fileListNode into an InvertedIndexNode->fileList based off a +// string in alphabetical order. Increments tf by 1 already exists. +// Returns new or preexisting fileListNode *. +struct FileListNode *insertFileList(struct InvertedIndexNode *node, + char *const filename) { + struct FileListNode *ret = createFileListNode(filename); + // Empty case. + if (!node->fileList) { + node->fileList = ret; + return ret; + } + // Front case. + int comparison = strcmp(filename, node->fileList->filename); + if (comparison < 0) { + ret->next = node->fileList; + node->fileList = ret; + return ret; + } else if (comparison == 0) { + ++node->fileList->tf; + free(ret); + return node->fileList; + } + // Middle case. + struct FileListNode *last = node->fileList; + for (struct FileListNode *i = node->fileList->next; i; i = i->next) { + int comp = strcmp(filename, i->filename); + if (comp < 0) { + last->next = ret; + ret->next = i; + return ret; + } else if (comp == 0) { + ++i->tf; + free(ret); + return i; + } + last = i; + } + // Back case. + last->next = ret; + return ret; +} + +// Inserts a fileListNode into a list via ordering of tf values. If the tf +// values are equal, inserts based off the filename in ascending order. +// Returns the new head of the list. +struct FileListNode *insertFileOrderedList(struct FileListNode *head, + struct FileListNode *insert, + double mult) { + struct FileListNode *ret = calloc(1, sizeof(struct FileListNode)); + ret->filename = strdup(insert->filename); + ret->tf = insert->tf * mult; + // Empty case. + if (!head) { + return ret; + } + + struct FileListNode *prev_duplicate = NULL; + struct FileListNode *duplicate = NULL; + for (struct FileListNode *i = head; i; i = i->next) { + if (!strcmp(ret->filename, i->filename)) { + duplicate = i; + break; + } + prev_duplicate = i; + } + + if (duplicate != NULL) { + if (prev_duplicate != NULL) { + prev_duplicate->next = duplicate->next; + } else { + head = duplicate->next; + } + duplicate->tf += ret->tf; + free(ret->filename); + free(ret); + return insertFileOrderedList(head, duplicate, mult); + } + + struct FileListNode *last = NULL; + for (struct FileListNode *i = head; i; i = i->next) { + const double scmp = strcmp(ret->filename, i->filename); + + if (ret->tf > i->tf) { + break; + } else if (insert->tf == i->tf) { + if (scmp <= 0) { + break; + } else { + last = i; + } + } else { + last = i; + } + } + + // Front case. + if (!last) { + ret->next = head; + return ret; + } + + // Middle or end case. + ret->next = last->next; + last->next = ret; + + return head; +} |
