aboutsummaryrefslogtreecommitdiff
path: root/comp2521/tf_idf
diff options
context:
space:
mode:
Diffstat (limited to 'comp2521/tf_idf')
-rw-r--r--comp2521/tf_idf/fileList.c118
-rw-r--r--comp2521/tf_idf/fileList.h35
-rw-r--r--comp2521/tf_idf/invertedIndex.c153
-rw-r--r--comp2521/tf_idf/invertedIndex.h78
-rw-r--r--comp2521/tf_idf/invertedIndexTree.c40
-rw-r--r--comp2521/tf_idf/invertedIndexTree.h38
-rw-r--r--comp2521/tf_idf/io.c69
-rw-r--r--comp2521/tf_idf/io.h21
8 files changed, 552 insertions, 0 deletions
diff --git a/comp2521/tf_idf/fileList.c b/comp2521/tf_idf/fileList.c
new file mode 100644
index 0000000..53dc0f9
--- /dev/null
+++ b/comp2521/tf_idf/fileList.c
@@ -0,0 +1,118 @@
+#include "fileList.h"
+
+// Returns a newly malloc'ed file list node, with word initialised to word
+// and tf set to 1.
+struct FileListNode *createFileListNode(char *const word) {
+ struct FileListNode *ret = calloc(1, sizeof(struct FileListNode));
+ ret->filename = word;
+ ret->tf = 1;
+ return ret;
+}
+
+// Inserts a fileListNode into an InvertedIndexNode->fileList based off a
+// string in alphabetical order. Increments tf by 1 already exists.
+// Returns new or preexisting fileListNode *.
+struct FileListNode *insertFileList(struct InvertedIndexNode *node,
+ char *const filename) {
+ struct FileListNode *ret = createFileListNode(filename);
+ // Empty case.
+ if (!node->fileList) {
+ node->fileList = ret;
+ return ret;
+ }
+ // Front case.
+ int comparison = strcmp(filename, node->fileList->filename);
+ if (comparison < 0) {
+ ret->next = node->fileList;
+ node->fileList = ret;
+ return ret;
+ } else if (comparison == 0) {
+ ++node->fileList->tf;
+ free(ret);
+ return node->fileList;
+ }
+ // Middle case.
+ struct FileListNode *last = node->fileList;
+ for (struct FileListNode *i = node->fileList->next; i; i = i->next) {
+ int comp = strcmp(filename, i->filename);
+ if (comp < 0) {
+ last->next = ret;
+ ret->next = i;
+ return ret;
+ } else if (comp == 0) {
+ ++i->tf;
+ free(ret);
+ return i;
+ }
+ last = i;
+ }
+ // Back case.
+ last->next = ret;
+ return ret;
+}
+
+// Inserts a fileListNode into a list via ordering of tf values. If the tf
+// values are equal, inserts based off the filename in ascending order.
+// Returns the new head of the list.
+struct FileListNode *insertFileOrderedList(struct FileListNode *head,
+ struct FileListNode *insert,
+ double mult) {
+ struct FileListNode *ret = calloc(1, sizeof(struct FileListNode));
+ ret->filename = strdup(insert->filename);
+ ret->tf = insert->tf * mult;
+ // Empty case.
+ if (!head) {
+ return ret;
+ }
+
+ struct FileListNode *prev_duplicate = NULL;
+ struct FileListNode *duplicate = NULL;
+ for (struct FileListNode *i = head; i; i = i->next) {
+ if (!strcmp(ret->filename, i->filename)) {
+ duplicate = i;
+ break;
+ }
+ prev_duplicate = i;
+ }
+
+ if (duplicate != NULL) {
+ if (prev_duplicate != NULL) {
+ prev_duplicate->next = duplicate->next;
+ } else {
+ head = duplicate->next;
+ }
+ duplicate->tf += ret->tf;
+ free(ret->filename);
+ free(ret);
+ return insertFileOrderedList(head, duplicate, mult);
+ }
+
+ struct FileListNode *last = NULL;
+ for (struct FileListNode *i = head; i; i = i->next) {
+ const double scmp = strcmp(ret->filename, i->filename);
+
+ if (ret->tf > i->tf) {
+ break;
+ } else if (insert->tf == i->tf) {
+ if (scmp <= 0) {
+ break;
+ } else {
+ last = i;
+ }
+ } else {
+ last = i;
+ }
+ }
+
+ // Front case.
+ if (!last) {
+ ret->next = head;
+ return ret;
+ }
+
+ // Middle or end case.
+ ret->next = last->next;
+ last->next = ret;
+
+ return head;
+}
diff --git a/comp2521/tf_idf/fileList.h b/comp2521/tf_idf/fileList.h
new file mode 100644
index 0000000..e7afb54
--- /dev/null
+++ b/comp2521/tf_idf/fileList.h
@@ -0,0 +1,35 @@
+#ifndef FILELIST_C_
+#define FILELIST_C_
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "invertedIndex.h"
+
+// Forward declaration for strdup in <string.h>
+char *strdup(const char *);
+
+/*
+ * File list node is unfortuntely defined in "invertedIndex.h":
+ struct FileListNode {
+ char *filename;
+ double tf; // relative tf
+ struct FileListNode *next;
+ };
+*/
+
+// Returns a newly malloc'ed file list node, with fields initialised to zero.
+struct FileListNode *createFileListNode();
+
+// Appends a FileListNode to the end of a FileListNode linked list.
+struct FileListNode *insertFileList(struct InvertedIndexNode *head,
+ char *const word);
+
+// Inserts a fileListNode into a list via ordering of tf values. If the tf
+// values are equal, inserts based off the filename in ascending order.
+// Returns the new head of the list.
+struct FileListNode *insertFileOrderedList(struct FileListNode *head,
+ struct FileListNode *insert,
+ double mult);
+#endif
diff --git a/comp2521/tf_idf/invertedIndex.c b/comp2521/tf_idf/invertedIndex.c
new file mode 100644
index 0000000..e3c21a1
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndex.c
@@ -0,0 +1,153 @@
+#include "invertedIndex.h"
+#include "fileList.h"
+#include "invertedIndexTree.h"
+#include "io.h"
+
+// Removes leading and trailing spaces, converts all characters to lowercase
+// and removes the punctutation marks .,;? if they end the string.
+char *normaliseWord(char *str) {
+ char *front = str;
+ for (int i = 0; str[i] != '\0'; ++i) {
+ if (!isspace(str[i])) {
+ front = &str[i];
+ break;
+ }
+ }
+ char *end = str;
+ for (int i = 0; front[i] != '\0'; ++i) {
+ front[i] = (char)tolower(front[i]);
+ if (front[i + 1] == '\0') {
+ end = &front[i + 1];
+ switch (front[i]) {
+ case '.':
+ case ',':
+ case ';':
+ case '?':
+ front[i] = '\0';
+ --end;
+ }
+ }
+ }
+ return memmove(str, front, (unsigned long)(end - front + 1));
+}
+
+// Opens the collection file, generates an inverted index from those files
+// listed in the collection file. Returns a generated inverted index.
+struct InvertedIndexNode *generateInvertedIndex(char *collectionFilename) {
+ FILE *f_fptr = fopen(collectionFilename, "r");
+ if (!f_fptr) {
+ fprintf(stderr,
+ "fopen(%s, \"r\")\" in generateInvertedIndex returned"
+ " NULL!\n",
+ collectionFilename);
+ return NULL;
+ }
+ // Read file for filenames, read those files for words, insert those words
+ // in an inverted index. Duplicates words increment tf by one for that
+ // file.
+ struct InvertedIndexNode *index = createInvertedIndexNode(NULL);
+ char *filename = NULL;
+ while ((filename = getNextWord(f_fptr))) {
+ FILE *w_fptr = fopen(filename, "r");
+ if (!w_fptr) {
+ fprintf(stderr,
+ "fopen(%s, \"r\") in generateInvertedIndex"
+ " returned NULL!\n",
+ filename);
+ return NULL;
+ }
+ char *word = NULL;
+ int word_count = 0;
+ // Two loops are required to get the total amount of words required in
+ // calculating tf.
+ while ((word = getNextWord(w_fptr))) {
+ ++word_count;
+ }
+ rewind(w_fptr);
+
+ while ((word = getNextWord(w_fptr))) {
+ word = normaliseWord(word);
+ // Base case where index is empty, handle manually.
+ if (index->word == NULL) {
+ index->word = word;
+ }
+ insertInvertedIndexNode(index, word);
+ struct InvertedIndexNode *i_node =
+ searchInvertedIndexNode(index, word);
+ struct FileListNode *f_node = insertFileList(i_node, filename);
+ // There may be multiple words in the same file, so only write to
+ // tf if the word does not exist again in the same file after that
+ // point.
+ if (!exists_after(w_fptr, word)) {
+ f_node->tf /= word_count;
+ }
+ }
+ fclose(w_fptr);
+ }
+ fclose(f_fptr);
+ return index;
+}
+
+// Outputs the given inverted index to a file named invertedIndex.txt.
+// One line per word, alphabetical, ascending order.
+void printInvertedIndex(struct InvertedIndexNode *node) {
+ if (!node) {
+ return;
+ }
+ FILE *fptr = fopen("invertedIndex.txt", "w");
+ if (fptr == NULL) {
+ fprintf(stderr,
+ "fopen(%s, \"r\") in printInvertedIndex"
+ " returned NULL!\n",
+ "invertedIndex.txt");
+ return;
+ }
+ writeInvertedIndex(fptr, node);
+ fclose(fptr);
+}
+
+// Returns an ordered list where each node contains a filename and the
+// corresponding tf-idf value for a given searchWord. You only need to
+// include documents (files) that contain the given searchWord. The list
+// must be in descending order of tf-idf value. If there are multiple
+// files with same tf-idf, order them by their filename in ascending
+// order. D is the total number of documents in the collection.
+struct TfIdfNode *calculateTfIdf(struct InvertedIndexNode *tree,
+ char *searchWord, int D) {
+ struct InvertedIndexNode *i_node =
+ searchInvertedIndexNode(tree, searchWord);
+ struct FileListNode *f_node = i_node->fileList;
+ int f_list_size = 0;
+ for (struct FileListNode *i = f_node; i; i = i->next) {
+ ++f_list_size;
+ }
+ double idf = log10((double)D / (double)f_list_size);
+ struct FileListNode *ret = NULL;
+ // tfidf is idf * tf
+ for (struct FileListNode *i = f_node; i; i = i->next) {
+ ret = insertFileOrderedList(ret, i, idf);
+ }
+ // Fortunately, the contents of the two structs are the same. We can just
+ // cast the pointer to the correct return type, and return that.
+ return (struct TfIdfNode *)ret;
+}
+
+// Returns an ordered list where each node contains a filename and the
+// summation of tf-idf values of all the matching search words for that
+// file. You only need to include documents (files) that contain one or
+// more of the given search words. The list must be in descending order
+// of summation of tf-idf values (tfIdfSum). If there are multiple files
+// with the same tf-idf sum, order them by their filename in ascending
+// order. D is the total number of documents in the collection.
+struct TfIdfNode *retrieve(InvertedIndexBST tree, char *searchWords[], int D) {
+ struct FileListNode *ret = NULL;
+ for (char **i = searchWords; *i; ++i) {
+ struct FileListNode *node =
+ (struct FileListNode *)calculateTfIdf(tree, *i, D);
+
+ for (struct FileListNode *f_i = node; f_i; f_i = f_i->next) {
+ ret = insertFileOrderedList(ret, f_i, 1.0);
+ }
+ }
+ return (struct TfIdfNode *)ret;
+}
diff --git a/comp2521/tf_idf/invertedIndex.h b/comp2521/tf_idf/invertedIndex.h
new file mode 100644
index 0000000..ddf9932
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndex.h
@@ -0,0 +1,78 @@
+// COMP2521 Assignment 1
+// DO NOT MODIFY THIS FILE
+
+#ifndef _INVERTEDINDEX_GUARD
+#define _INVERTEDINDEX_GUARD
+
+struct FileListNode {
+ char *filename;
+ double tf; // relative tf
+ struct FileListNode *next;
+};
+typedef struct FileListNode *FileList;
+
+struct InvertedIndexNode {
+ char *word; // key
+ struct FileListNode *fileList;
+ struct InvertedIndexNode *left;
+ struct InvertedIndexNode *right;
+
+};
+typedef struct InvertedIndexNode *InvertedIndexBST;
+
+struct TfIdfNode {
+ char *filename;
+ double tfIdfSum; // tfidf sum value
+ struct TfIdfNode *next;
+};
+typedef struct TfIdfNode *TfIdfList;
+
+// Functions for Part 1
+
+/**
+ * Normalises a given string. See the spec for details. Note: you should
+ * modify the given string - do not create a copy of it.
+ */
+char *normaliseWord(char *str);
+
+/**
+ * This function opens the collection file with the given name, and then
+ * generates an inverted index from those files listed in the collection
+ * file, as discussed in the spec. It returns the generated inverted
+ * index.
+ */
+InvertedIndexBST generateInvertedIndex(char *collectionFilename);
+
+/**
+ * Outputs the given inverted index to a file named invertedIndex.txt.
+ * The output should contain one line per word, with the words ordered
+ * alphabetically in ascending order. Each list of filenames for a word
+ * should be ordered alphabetically in ascending order.
+*/
+void printInvertedIndex(InvertedIndexBST tree);
+
+// Functions for Part-2
+
+/**
+ * Returns an ordered list where each node contains a filename and the
+ * corresponding tf-idf value for a given searchWord. You only need to
+ * include documents (files) that contain the given searchWord. The list
+ * must be in descending order of tf-idf value. If there are multiple
+ * files with same tf-idf, order them by their filename in ascending
+ * order. D is the total number of documents in the collection.
+ */
+TfIdfList calculateTfIdf(InvertedIndexBST tree, char *searchWord, int D);
+
+/**
+ * Returns an ordered list where each node contains a filename and the
+ * summation of tf-idf values of all the matching search words for that
+ * file. You only need to include documents (files) that contain one or
+ * more of the given search words. The list must be in descending order
+ * of summation of tf-idf values (tfIdfSum). If there are multiple files
+ * with the same tf-idf sum, order them by their filename in ascending
+ * order. D is the total number of documents in the collection.
+ */
+TfIdfList retrieve(InvertedIndexBST tree, char *searchWords[], int D);
+
+#endif
+
diff --git a/comp2521/tf_idf/invertedIndexTree.c b/comp2521/tf_idf/invertedIndexTree.c
new file mode 100644
index 0000000..b66d6a8
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndexTree.c
@@ -0,0 +1,40 @@
+#include "invertedIndexTree.h"
+
+// Returns a newly malloc'ed inverted index node, fields initialised to zero
+// except for word.
+struct InvertedIndexNode *createInvertedIndexNode(char *const word) {
+ struct InvertedIndexNode *ret = calloc(1, sizeof(struct InvertedIndexNode));
+ ret->word = word;
+ return ret;
+}
+
+// Inserts to a FileListNode BST. Uses strcmp to determine priority, with
+// negative values moving into the left and positive values into the right.
+struct InvertedIndexNode *
+insertInvertedIndexNode(struct InvertedIndexNode *node, char *const word) {
+ if (node == NULL) {
+ return createInvertedIndexNode(word);
+ }
+
+ const int comparison = strcmp(word, node->word);
+ if (comparison < 0) {
+ node->left = insertInvertedIndexNode(node->left, word);
+ }
+ if (comparison > 0) {
+ node->right = insertInvertedIndexNode(node->right, word);
+ }
+ return node;
+}
+
+// Searches for an invertedIndexNode via a string.
+struct InvertedIndexNode *
+searchInvertedIndexNode(struct InvertedIndexNode *head, char *const word) {
+ int comparison = strcmp(word, head->word);
+ if (head == NULL || !comparison) {
+ return head;
+ }
+ if (comparison < 0) {
+ return searchInvertedIndexNode(head->left, word);
+ }
+ return searchInvertedIndexNode(head->right, word);
+}
diff --git a/comp2521/tf_idf/invertedIndexTree.h b/comp2521/tf_idf/invertedIndexTree.h
new file mode 100644
index 0000000..29f93c5
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndexTree.h
@@ -0,0 +1,38 @@
+#ifndef INVERTEDINDEXTREE_H_
+#define INVERTEDINDEXTREE_H_
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "invertedIndex.h"
+
+/*
+ * invertedIndexTree is a binary search tree of the following struct, defined
+ * in invertedIndex.h":
+ struct InvertedIndexNode {
+ char *word; // key
+ struct FileListNode *fileList;
+ struct InvertedIndexNode *left;
+ struct InvertedIndexNode *right;
+
+ };
+*/
+
+// Returns a newly malloc'ed inverted index node, fields initialised to zero
+// except for word.
+struct InvertedIndexNode *createInvertedIndexNode(char *const word);
+
+// Inserts to a FileListNode BST. Uses strcmp to determine priority, with
+// negative values moving into the left and positive values into the right.
+struct InvertedIndexNode *
+insertInvertedIndexNode(struct InvertedIndexNode *node, char *const word);
+
+// Searches for an invertedIndexNode based off the a word.
+struct InvertedIndexNode *
+searchInvertedIndexNode(struct InvertedIndexNode *node, char *const word);
+
+// Searches for an invertedIndexNode via a string.
+struct InvertedIndexNode *
+searchInvertedIndexNode(struct InvertedIndexNode *head, char *const word);
+#endif
diff --git a/comp2521/tf_idf/io.c b/comp2521/tf_idf/io.c
new file mode 100644
index 0000000..566b1b9
--- /dev/null
+++ b/comp2521/tf_idf/io.c
@@ -0,0 +1,69 @@
+#include "io.h"
+
+// Returns a malloc'ed char* of the next word in the file. NULL otherwise.
+// Undefined behaviour if the word is greater than 100 chars.
+// We will assume that the specification is referring to length as
+// LENGTH + \0, so 101 bytes should be allocated.
+char *getNextWord(FILE *const fptr) {
+ // We can assume a maximum word length of 100.
+ char *ret = malloc(sizeof(char) * 101);
+ // Read until there is a non-space.
+ for (int c = fgetc(fptr); c != EOF; c = fgetc(fptr)) {
+ if (isspace(c)) {
+ continue;
+ }
+ // Put the char back into the stream so that we may read it correctly
+ // in the next loop.
+ ungetc(c, fptr);
+ break;
+ }
+ int str_pos = 0;
+ // Read the word and write it into ret, until there is another space char.
+ for (int c = fgetc(fptr); c != EOF; c = fgetc(fptr)) {
+ if (isspace(c)) {
+ break;
+ }
+ ret[str_pos++] = (char)c;
+ }
+ ret[str_pos] = '\0';
+ // Only return a value if ret has been written to.
+ if (!str_pos) {
+ free(ret);
+ return NULL;
+ }
+ // Requires +2 as str_pos is (size -1) AND null terminator.
+ return realloc(ret, (size_t)str_pos + 1);
+}
+
+// Returns true if the string in the second argument exists past the fptr in a
+// file. Preserves original position of the fptr.
+bool exists_after(FILE *const fptr, char *const string) {
+ bool exists = false;
+ // Save the position and restore it when the function ends.
+ long pos = ftell(fptr);
+ char *word = NULL;
+ while ((word = getNextWord(fptr))) {
+ word = normaliseWord(word);
+ if (!strcmp(word, string)) {
+ exists = true;
+ break;
+ }
+ }
+ rewind(fptr);
+ fseek(fptr, pos, 0);
+ return exists;
+}
+
+// Outputs the contents of an inverted index to the file specified by fptr.
+void writeInvertedIndex(FILE *const fptr, struct InvertedIndexNode *node) {
+ if (node == NULL) {
+ return;
+ }
+ writeInvertedIndex(fptr, node->left);
+ fprintf(fptr, "%s ", node->word);
+ for (struct FileListNode *i = node->fileList; i != NULL; i = i->next) {
+ fprintf(fptr, "%s (%lf) ", i->filename, i->tf);
+ }
+ fprintf(fptr, "\n");
+ writeInvertedIndex(fptr, node->right);
+}
diff --git a/comp2521/tf_idf/io.h b/comp2521/tf_idf/io.h
new file mode 100644
index 0000000..76b225a
--- /dev/null
+++ b/comp2521/tf_idf/io.h
@@ -0,0 +1,21 @@
+#ifndef IO_H_
+#define IO_H_
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "invertedIndex.h"
+
+// Returns a malloc'ed char* of the next word in the file. NULL otherwise.
+// Undefined behaviour if the word is greater than 100 chars.
+char *getNextWord(FILE *const fptr);
+
+// Returns true if a string in the second argument exists past the fptr in a
+// file.
+bool exists_after(FILE *const fptr, char *const string);
+
+// Outputs the contents of an inverted index to the file specified by fptr.
+void writeInvertedIndex(FILE *const fptr, struct InvertedIndexNode *node);
+#endif