initial commit

author: Nicolas James <Eele1Ephe7uZahRie@tutanota.com> 2025-02-13 18:00:17 +1100
committer: Nicolas James <Eele1Ephe7uZahRie@tutanota.com> 2025-02-13 18:00:17 +1100
commit: 98cef5e9a772602d42acfcf233838c760424db9a (patch)
tree: 5277fa1d7cc0a69a0f166fcbf10fd320f345f049 /comp2521
10 files changed, 804 insertions, 0 deletions
diff --git a/comp2521/sna/Dijkstra.c b/comp2521/sna/Dijkstra.c
new file mode 100644
index 0000000..2600690
--- /dev/null
+++ b/comp2521/sna/Dijkstra.c
@@ -0,0 +1,153 @@
+#include "Dijkstra.h"
+
+#include "PQ.h"
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ShortestPaths {
+        int numNodes;    // The number of vertices in the graph
+
+        Vertex src;      // The source vertex
+
+        int *dist;       // An  array  of distances from the source vertex -
+                         // one for each vertex (the distance  from  src  to
+                         // itself is 0)
+                         // dist[v] contains the distance from src to v
+
+        PredNode **pred; // An  array  of  predecessor  lists - one for each
+                         // vertex
+                         // pred[v] contains the predecessor list for vertex
+                         // v
+}
+
+PredNode {
+        Vertex v;
+        struct PredNode *next;
+};
+*/
+
+static void free_pred_list(PredNode *head) {
+    for (PredNode *i = head; i != NULL;) {
+        PredNode *tmp = i->next;
+        free(i);
+        i = tmp;
+    }
+}
+
+static PredNode *add_pred_back(PredNode *head, const Vertex v) {
+    PredNode *ret = calloc(1, sizeof(PredNode));
+    ret->v = v;
+    // Empty Case.
+    if (head == NULL) {
+        return ret;
+    }
+    PredNode *last = head;
+    // Find the last element in the linked list.
+    for (PredNode *i = head->next; i != NULL; i = i->next) {
+        last = i;
+    }
+    last->next = ret;
+    return head;
+}
+
+// Finds the shortest path from src to all other vertices and stores this
+// information in a ShortestPaths struct.
+// However, if there are multiple shortest paths of the same weight, also store
+// such paths as preds.
+ShortestPaths dijkstra(Graph g, Vertex src) {
+
+    // Initialise return struct ret and allocate appropriate memory.
+    ShortestPaths ret;
+    ret.src = src;
+    const int num_nodes = GraphNumVertices(g);
+    ret.numNodes = num_nodes;
+    ret.dist = calloc((unsigned int)num_nodes, sizeof(int));
+    ret.pred = calloc((unsigned int)num_nodes, sizeof(PredNode));
+
+    // Set all distances as INT_MAX to represent infinity, but the src vertex
+    // as 0 to ensure it is chosen first.
+    for (int i = 0; i < num_nodes; ++i) {
+        ret.dist[i] = INT_MAX;
+    }
+    ret.dist[src] = 0;
+
+    // Initialise queue, insert itself in queue and initialise its distance
+    // as 0.
+    PQ queue = PQNew();
+    PQInsert(queue, src, ret.dist[src]);
+
+    // Perform Dijkstra's algorithm by looping through the queue until empty.
+    while (!PQIsEmpty(queue)) {
+        // Dequeue the vertex with the lowest weight.
+        const Vertex vertex = PQDequeue(queue);
+
+        // Loop through all adjacent vertices for the vertex.
+        AdjList adjacents = GraphOutIncident(g, vertex);
+        for (AdjList i = adjacents; i != NULL; i = i->next) {
+            const int v = i->v;
+            const int w = i->weight;
+            const int path = ret.dist[vertex] + w;
+
+            // Inefficient path, continue.
+            if (ret.dist[v] < path) {
+                continue;
+            }
+
+            // Better path than previously discovered. Free the current pred
+            // list and update distance.
+            if (ret.dist[v] > path) {
+                ret.dist[v] = path;
+                PQInsert(queue, v, ret.dist[v]);
+                free_pred_list(ret.pred[v]);
+                ret.pred[v] = NULL;
+            }
+
+            // Update pred list.
+            ret.pred[v] = add_pred_back(ret.pred[v], vertex);
+        }
+    }
+
+    // After performing the algorithm, we must set all INT_MAX distances back
+    // to zero to conform to the spec.
+    for (int i = 0; i < num_nodes; ++i) {
+        if (ret.dist[i] == INT_MAX) {
+            ret.dist[i] = 0;
+        }
+    }
+
+    return ret;
+}
+
+// Prints out the ShortestPaths structure.
+// Does not need to comply with any requirements.
+void showShortestPaths(ShortestPaths sps) {
+    printf("Shortest path\n");
+    printf("\tNumNodes: %d\n", sps.numNodes);
+    printf("\tsrc: %d\n", sps.src);
+    printf("\tdist\n");
+    for (int i = 0; i < sps.numNodes; ++i) {
+        printf("\t\t%d: %d\n", i, sps.dist[i]);
+    }
+    printf("\tpred\n");
+    for (int i = 0; i < sps.numNodes; ++i) {
+        printf("\t\t%d\n", i);
+        for (PredNode *j = sps.pred[i]; j != NULL; j = j->next) {
+            printf("\t\t\t%d\n", j->v);
+        }
+    }
+}
+
+// Frees all memory associated with a ShortestPaths structure.
+void freeShortestPaths(ShortestPaths sps) {
+    for (int i = 0; i < sps.numNodes; ++i) {
+        for (PredNode *j = sps.pred[i]; j != NULL;) {
+            PredNode *next = j->next;
+            free(j);
+            j = next;
+        }
+    }
+    free(sps.pred);
+    free(sps.dist);
+}
diff --git a/comp2521/sna/Dijkstra.h b/comp2521/sna/Dijkstra.h
new file mode 100644
index 0000000..0c3578b
--- /dev/null
+++ b/comp2521/sna/Dijkstra.h
@@ -0,0 +1,99 @@
+// Dijkstra ADT interface
+// COMP2521 Assignment 2
+
+// Note: You MUST NOT modify this file.
+
+#include <stdbool.h>
+
+#include "Graph.h"
+
+typedef struct PredNode {
+	Vertex v;
+	struct PredNode *next;
+} PredNode;
+
+typedef struct ShortestPaths {
+	int numNodes;    // The number of vertices in the graph
+	
+	Vertex src;      // The source vertex
+	
+	int *dist;       // An  array  of distances from the source vertex -
+	                 // one for each vertex (the distance  from  src  to
+	                 // itself is 0)
+	                 // dist[v] contains the distance from src to v
+	                 
+	PredNode **pred; // An  array  of  predecessor  lists - one for each
+	                 // vertex
+	                 // pred[v] contains the predecessor list for vertex
+	                 // v
+} ShortestPaths;
+
+/**
+ * Finds  all  shortest  paths  from  a given source vertex to all other
+ * vertices as discussed in the lectures.
+ *
+ * This  function  offers  one **additional feature** - if there is more
+ * than one shortest path from the source vertex to  another  vertex,  a
+ * vertex  could  have  multiple predecessors (see spec for an example).
+ * The function must keep track of multiple predecessors for each vertex
+ * (if  they  exist)  by storing the predecessor(s) in the corresponding
+ * linked list for that vertex. This will be discussed in detail in  the
+ * lecture.
+ * 
+ * For  example,  suppose that while finding the shortest paths from the
+ * source vertex 0 to all other vertices, we discovered  that  vertex  1
+ * has two possible predecessors on different shortest paths. 
+ * 
+ * Node 0
+ *   Distance
+ *     0 : X
+ *     1 : 2
+ *     2 : 1
+ *   Preds
+ *     0 : NULL
+ *     1 : [0]->[2]->NULL
+ *     2 : [0]->NULL
+ * 
+ * Node 1
+ *   Distance
+ *     0 : 2
+ *     1 : X
+ *     2 : 3
+ *   Preds
+ *     0 : [1]->NULL
+ *     1 : NULL
+ *     2 : [0]->NULL
+ * 
+ * Node 2
+ *   Distance
+ *     0 : 3
+ *     1 : 1
+ *     2 : X
+ *   Preds
+ *     0 : [1]->NULL
+ *     1 : [2]->NULL
+ *     2 : NULL
+ * 
+ * The  function  returns  a 'ShortestPaths' structure with the required
+ * information:
+ * - the number of vertices in the graph
+ * - the source vertex
+ * - distance array
+ * - array of predecessor lists
+ */
+ShortestPaths dijkstra(Graph g, Vertex src);
+
+/**
+ * This  function  is  for  you to print out the ShortestPaths structure
+ * while you are debugging/testing your implementation. 
+ * 
+ * We  will  not call this function during testing, so you may print out
+ * the given ShortestPaths structure in whatever format you want.
+ */
+void showShortestPaths(ShortestPaths sps);
+
+/**
+ * Frees all memory associated with the given ShortestPaths structure.
+ */
+void freeShortestPaths(ShortestPaths sps);
+
diff --git a/comp2521/tf_idf/fileList.c b/comp2521/tf_idf/fileList.c
new file mode 100644
index 0000000..53dc0f9
--- /dev/null
+++ b/comp2521/tf_idf/fileList.c
@@ -0,0 +1,118 @@
+#include "fileList.h"
+
+// Returns a newly malloc'ed file list node, with word initialised to word
+// and tf set to 1.
+struct FileListNode *createFileListNode(char *const word) {
+    struct FileListNode *ret = calloc(1, sizeof(struct FileListNode));
+    ret->filename = word;
+    ret->tf = 1;
+    return ret;
+}
+
+// Inserts a fileListNode into an InvertedIndexNode->fileList based off a
+// string in alphabetical order. Increments tf by 1 already exists.
+// Returns new or preexisting fileListNode *.
+struct FileListNode *insertFileList(struct InvertedIndexNode *node,
+                                    char *const filename) {
+    struct FileListNode *ret = createFileListNode(filename);
+    // Empty case.
+    if (!node->fileList) {
+        node->fileList = ret;
+        return ret;
+    }
+    // Front case.
+    int comparison = strcmp(filename, node->fileList->filename);
+    if (comparison < 0) {
+        ret->next = node->fileList;
+        node->fileList = ret;
+        return ret;
+    } else if (comparison == 0) {
+        ++node->fileList->tf;
+        free(ret);
+        return node->fileList;
+    }
+    // Middle case.
+    struct FileListNode *last = node->fileList;
+    for (struct FileListNode *i = node->fileList->next; i; i = i->next) {
+        int comp = strcmp(filename, i->filename);
+        if (comp < 0) {
+            last->next = ret;
+            ret->next = i;
+            return ret;
+        } else if (comp == 0) {
+            ++i->tf;
+            free(ret);
+            return i;
+        }
+        last = i;
+    }
+    // Back case.
+    last->next = ret;
+    return ret;
+}
+
+// Inserts a fileListNode into a list via ordering of tf values. If the tf
+// values are equal, inserts based off the filename in ascending order.
+// Returns the new head of the list.
+struct FileListNode *insertFileOrderedList(struct FileListNode *head,
+                                           struct FileListNode *insert,
+                                           double mult) {
+    struct FileListNode *ret = calloc(1, sizeof(struct FileListNode));
+    ret->filename = strdup(insert->filename);
+    ret->tf = insert->tf * mult;
+    // Empty case.
+    if (!head) {
+        return ret;
+    }
+
+    struct FileListNode *prev_duplicate = NULL;
+    struct FileListNode *duplicate = NULL;
+    for (struct FileListNode *i = head; i; i = i->next) {
+        if (!strcmp(ret->filename, i->filename)) {
+            duplicate = i;
+            break;
+        }
+        prev_duplicate = i;
+    }
+
+    if (duplicate != NULL) {
+        if (prev_duplicate != NULL) {
+            prev_duplicate->next = duplicate->next;
+        } else {
+            head = duplicate->next;
+        }
+        duplicate->tf += ret->tf;
+        free(ret->filename);
+        free(ret);
+        return insertFileOrderedList(head, duplicate, mult);
+    }
+
+    struct FileListNode *last = NULL;
+    for (struct FileListNode *i = head; i; i = i->next) {
+        const double scmp = strcmp(ret->filename, i->filename);
+
+        if (ret->tf > i->tf) {
+            break;
+        } else if (insert->tf == i->tf) {
+            if (scmp <= 0) {
+                break;
+            } else {
+                last = i;
+            }
+        } else {
+            last = i;
+        }
+    }
+
+    // Front case.
+    if (!last) {
+        ret->next = head;
+        return ret;
+    }
+
+    // Middle or end case.
+    ret->next = last->next;
+    last->next = ret;
+
+    return head;
+}
diff --git a/comp2521/tf_idf/fileList.h b/comp2521/tf_idf/fileList.h
new file mode 100644
index 0000000..e7afb54
--- /dev/null
+++ b/comp2521/tf_idf/fileList.h
@@ -0,0 +1,35 @@
+#ifndef FILELIST_C_
+#define FILELIST_C_
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "invertedIndex.h"
+
+// Forward declaration for strdup in <string.h>
+char *strdup(const char *);
+
+/*
+ * File list node is unfortuntely defined in "invertedIndex.h":
+    struct FileListNode {
+        char *filename;
+        double tf; // relative tf
+        struct FileListNode *next;
+    };
+*/
+
+// Returns a newly malloc'ed file list node, with fields initialised to zero.
+struct FileListNode *createFileListNode();
+
+// Appends a FileListNode to the end of a FileListNode linked list.
+struct FileListNode *insertFileList(struct InvertedIndexNode *head,
+                                    char *const word);
+
+// Inserts a fileListNode into a list via ordering of tf values. If the tf
+// values are equal, inserts based off the filename in ascending order.
+// Returns the new head of the list.
+struct FileListNode *insertFileOrderedList(struct FileListNode *head,
+                                           struct FileListNode *insert,
+                                           double mult);
+#endif
diff --git a/comp2521/tf_idf/invertedIndex.c b/comp2521/tf_idf/invertedIndex.c
new file mode 100644
index 0000000..e3c21a1
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndex.c
@@ -0,0 +1,153 @@
+#include "invertedIndex.h"
+#include "fileList.h"
+#include "invertedIndexTree.h"
+#include "io.h"
+
+// Removes leading and trailing spaces, converts all characters to lowercase
+// and removes the punctutation marks .,;? if they end the string.
+char *normaliseWord(char *str) {
+    char *front = str;
+    for (int i = 0; str[i] != '\0'; ++i) {
+        if (!isspace(str[i])) {
+            front = &str[i];
+            break;
+        }
+    }
+    char *end = str;
+    for (int i = 0; front[i] != '\0'; ++i) {
+        front[i] = (char)tolower(front[i]);
+        if (front[i + 1] == '\0') {
+            end = &front[i + 1];
+            switch (front[i]) {
+            case '.':
+            case ',':
+            case ';':
+            case '?':
+                front[i] = '\0';
+                --end;
+            }
+        }
+    }
+    return memmove(str, front, (unsigned long)(end - front + 1));
+}
+
+// Opens the collection file, generates an inverted index from those files
+// listed in the collection file. Returns a generated inverted index.
+struct InvertedIndexNode *generateInvertedIndex(char *collectionFilename) {
+    FILE *f_fptr = fopen(collectionFilename, "r");
+    if (!f_fptr) {
+        fprintf(stderr,
+                "fopen(%s, \"r\")\" in generateInvertedIndex returned"
+                " NULL!\n",
+                collectionFilename);
+        return NULL;
+    }
+    // Read file for filenames, read those files for words, insert those words
+    // in an inverted index. Duplicates words increment tf by one for that
+    // file.
+    struct InvertedIndexNode *index = createInvertedIndexNode(NULL);
+    char *filename = NULL;
+    while ((filename = getNextWord(f_fptr))) {
+        FILE *w_fptr = fopen(filename, "r");
+        if (!w_fptr) {
+            fprintf(stderr,
+                    "fopen(%s, \"r\") in generateInvertedIndex"
+                    " returned NULL!\n",
+                    filename);
+            return NULL;
+        }
+        char *word = NULL;
+        int word_count = 0;
+        // Two loops are required to get the total amount of words required in
+        // calculating tf.
+        while ((word = getNextWord(w_fptr))) {
+            ++word_count;
+        }
+        rewind(w_fptr);
+
+        while ((word = getNextWord(w_fptr))) {
+            word = normaliseWord(word);
+            // Base case where index is empty, handle manually.
+            if (index->word == NULL) {
+                index->word = word;
+            }
+            insertInvertedIndexNode(index, word);
+            struct InvertedIndexNode *i_node =
+                searchInvertedIndexNode(index, word);
+            struct FileListNode *f_node = insertFileList(i_node, filename);
+            // There may be multiple words in the same file, so only write to
+            // tf if the word does not exist again in the same file after that
+            // point.
+            if (!exists_after(w_fptr, word)) {
+                f_node->tf /= word_count;
+            }
+        }
+        fclose(w_fptr);
+    }
+    fclose(f_fptr);
+    return index;
+}
+
+// Outputs the given inverted index to a file named invertedIndex.txt.
+// One line per word, alphabetical, ascending order.
+void printInvertedIndex(struct InvertedIndexNode *node) {
+    if (!node) {
+        return;
+    }
+    FILE *fptr = fopen("invertedIndex.txt", "w");
+    if (fptr == NULL) {
+        fprintf(stderr,
+                "fopen(%s, \"r\") in printInvertedIndex"
+                " returned NULL!\n",
+                "invertedIndex.txt");
+        return;
+    }
+    writeInvertedIndex(fptr, node);
+    fclose(fptr);
+}
+
+// Returns  an  ordered list where each node contains a filename and the
+// corresponding tf-idf value for a given searchWord. You only  need  to
+// include documents (files) that contain the given searchWord. The list
+// must  be  in  descending order of tf-idf value. If there are multiple
+// files with same tf-idf, order them by  their  filename  in  ascending
+// order. D is the total number of documents in the collection.
+struct TfIdfNode *calculateTfIdf(struct InvertedIndexNode *tree,
+                                 char *searchWord, int D) {
+    struct InvertedIndexNode *i_node =
+        searchInvertedIndexNode(tree, searchWord);
+    struct FileListNode *f_node = i_node->fileList;
+    int f_list_size = 0;
+    for (struct FileListNode *i = f_node; i; i = i->next) {
+        ++f_list_size;
+    }
+    double idf = log10((double)D / (double)f_list_size);
+    struct FileListNode *ret = NULL;
+    // tfidf is idf * tf
+    for (struct FileListNode *i = f_node; i; i = i->next) {
+        ret = insertFileOrderedList(ret, i, idf);
+    }
+    // Fortunately, the contents of the two structs are the same. We can just
+    // cast the pointer to the correct return type, and return that.
+    return (struct TfIdfNode *)ret;
+}
+
+// Returns  an  ordered list where each node contains a filename and the
+// summation of tf-idf values of all the matching search words for  that
+// file.  You only need to include documents (files) that contain one or
+// more of the given search words. The list must be in descending  order
+// of summation of tf-idf values (tfIdfSum). If there are multiple files
+// with  the  same tf-idf sum, order them by their filename in ascending
+// order. D is the total number of documents in the collection.
+struct TfIdfNode *retrieve(InvertedIndexBST tree, char *searchWords[], int D) {
+    struct FileListNode *ret = NULL;
+    for (char **i = searchWords; *i; ++i) {
+        struct FileListNode *node =
+            (struct FileListNode *)calculateTfIdf(tree, *i, D);
+
+        for (struct FileListNode *f_i = node; f_i; f_i = f_i->next) {
+            ret = insertFileOrderedList(ret, f_i, 1.0);
+        }
+    }
+    return (struct TfIdfNode *)ret;
+}
diff --git a/comp2521/tf_idf/invertedIndex.h b/comp2521/tf_idf/invertedIndex.h
new file mode 100644
index 0000000..ddf9932
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndex.h
@@ -0,0 +1,78 @@
+// COMP2521 Assignment 1
+// DO NOT MODIFY THIS FILE
+
+#ifndef _INVERTEDINDEX_GUARD
+#define _INVERTEDINDEX_GUARD
+
+struct FileListNode {
+	char *filename;
+	double tf; // relative tf
+	struct FileListNode *next;
+};
+typedef struct FileListNode *FileList;
+
+struct InvertedIndexNode {
+	char *word; // key
+	struct FileListNode *fileList;
+	struct InvertedIndexNode *left;
+	struct InvertedIndexNode *right;
+
+};
+typedef struct InvertedIndexNode *InvertedIndexBST;
+
+struct TfIdfNode {
+	char *filename;
+	double tfIdfSum; // tfidf sum value
+	struct TfIdfNode *next;
+};
+typedef struct TfIdfNode *TfIdfList;
+
+// Functions for Part 1
+
+/**
+ * Normalises a given string. See the spec for details. Note: you should
+ * modify the given string - do not create a copy of it.
+ */
+char *normaliseWord(char *str);
+
+/**
+ * This function opens the collection file with the given name, and then
+ * generates an inverted index from those files listed in the collection
+ * file,  as  discussed  in  the spec. It returns the generated inverted
+ * index.
+ */
+InvertedIndexBST generateInvertedIndex(char *collectionFilename);
+
+/**
+ * Outputs  the  given inverted index to a file named invertedIndex.txt.
+ * The output should contain one line per word, with the  words  ordered
+ * alphabetically  in ascending order. Each list of filenames for a word
+ * should be ordered alphabetically in ascending order.
+*/
+void printInvertedIndex(InvertedIndexBST tree); 
+
+// Functions for Part-2
+
+/**
+ * Returns  an  ordered list where each node contains a filename and the 
+ * corresponding tf-idf value for a given searchWord. You only  need  to
+ * include documents (files) that contain the given searchWord. The list
+ * must  be  in  descending order of tf-idf value. If there are multiple
+ * files with same tf-idf, order them by  their  filename  in  ascending
+ * order. D is the total number of documents in the collection.
+ */
+TfIdfList calculateTfIdf(InvertedIndexBST tree, char *searchWord, int D);
+
+/**
+ * Returns  an  ordered list where each node contains a filename and the
+ * summation of tf-idf values of all the matching search words for  that
+ * file.  You only need to include documents (files) that contain one or
+ * more of the given search words. The list must be in descending  order
+ * of summation of tf-idf values (tfIdfSum). If there are multiple files
+ * with  the  same tf-idf sum, order them by their filename in ascending
+ * order. D is the total number of documents in the collection.
+ */
+TfIdfList retrieve(InvertedIndexBST tree, char *searchWords[], int D);
+
+#endif
+
diff --git a/comp2521/tf_idf/invertedIndexTree.c b/comp2521/tf_idf/invertedIndexTree.c
new file mode 100644
index 0000000..b66d6a8
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndexTree.c
@@ -0,0 +1,40 @@
+#include "invertedIndexTree.h"
+
+// Returns a newly malloc'ed inverted index node, fields initialised to zero
+// except for word.
+struct InvertedIndexNode *createInvertedIndexNode(char *const word) {
+    struct InvertedIndexNode *ret = calloc(1, sizeof(struct InvertedIndexNode));
+    ret->word = word;
+    return ret;
+}
+
+// Inserts to a FileListNode BST. Uses strcmp to determine priority, with
+// negative values moving into the left and positive values into the right.
+struct InvertedIndexNode *
+insertInvertedIndexNode(struct InvertedIndexNode *node, char *const word) {
+    if (node == NULL) {
+        return createInvertedIndexNode(word);
+    }
+
+    const int comparison = strcmp(word, node->word);
+    if (comparison < 0) {
+        node->left = insertInvertedIndexNode(node->left, word);
+    }
+    if (comparison > 0) {
+        node->right = insertInvertedIndexNode(node->right, word);
+    }
+    return node;
+}
+
+// Searches for an invertedIndexNode via a string.
+struct InvertedIndexNode *
+searchInvertedIndexNode(struct InvertedIndexNode *head, char *const word) {
+    int comparison = strcmp(word, head->word);
+    if (head == NULL || !comparison) {
+        return head;
+    }
+    if (comparison < 0) {
+        return searchInvertedIndexNode(head->left, word);
+    }
+    return searchInvertedIndexNode(head->right, word);
+}
diff --git a/comp2521/tf_idf/invertedIndexTree.h b/comp2521/tf_idf/invertedIndexTree.h
new file mode 100644
index 0000000..29f93c5
--- /dev/null
+++ b/comp2521/tf_idf/invertedIndexTree.h
@@ -0,0 +1,38 @@
+#ifndef INVERTEDINDEXTREE_H_
+#define INVERTEDINDEXTREE_H_
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "invertedIndex.h"
+
+/*
+ * invertedIndexTree is a binary search tree of the following struct, defined
+ * in invertedIndex.h":
+    struct InvertedIndexNode {
+        char *word; // key
+        struct FileListNode *fileList;
+        struct InvertedIndexNode *left;
+        struct InvertedIndexNode *right;
+
+    };
+*/
+
+// Returns a newly malloc'ed inverted index node, fields initialised to zero
+// except for word.
+struct InvertedIndexNode *createInvertedIndexNode(char *const word);
+
+// Inserts to a FileListNode BST. Uses strcmp to determine priority, with
+// negative values moving into the left and positive values into the right.
+struct InvertedIndexNode *
+insertInvertedIndexNode(struct InvertedIndexNode *node, char *const word);
+
+// Searches for an invertedIndexNode based off the a word.
+struct InvertedIndexNode *
+searchInvertedIndexNode(struct InvertedIndexNode *node, char *const word);
+
+// Searches for an invertedIndexNode via a string.
+struct InvertedIndexNode *
+searchInvertedIndexNode(struct InvertedIndexNode *head, char *const word);
+#endif
diff --git a/comp2521/tf_idf/io.c b/comp2521/tf_idf/io.c
new file mode 100644
index 0000000..566b1b9
--- /dev/null
+++ b/comp2521/tf_idf/io.c
@@ -0,0 +1,69 @@
+#include "io.h"
+
+// Returns a malloc'ed char* of the next word in the file. NULL otherwise.
+// Undefined behaviour if the word is greater than 100 chars.
+// We will assume that the specification is referring to length as
+// LENGTH + \0, so 101 bytes should be allocated.
+char *getNextWord(FILE *const fptr) {
+    // We can assume a maximum word length of 100.
+    char *ret = malloc(sizeof(char) * 101);
+    // Read until there is a non-space.
+    for (int c = fgetc(fptr); c != EOF; c = fgetc(fptr)) {
+        if (isspace(c)) {
+            continue;
+        }
+        // Put the char back into the stream so that we may read it correctly
+        // in the next loop.
+        ungetc(c, fptr);
+        break;
+    }
+    int str_pos = 0;
+    // Read the word and write it into ret, until there is another space char.
+    for (int c = fgetc(fptr); c != EOF; c = fgetc(fptr)) {
+        if (isspace(c)) {
+            break;
+        }
+        ret[str_pos++] = (char)c;
+    }
+    ret[str_pos] = '\0';
+    // Only return a value if ret has been written to.
+    if (!str_pos) {
+        free(ret);
+        return NULL;
+    }
+    // Requires +2 as str_pos is (size -1) AND null terminator.
+    return realloc(ret, (size_t)str_pos + 1);
+}
+
+// Returns true if the string in the second argument exists past the fptr in a
+// file. Preserves original position of the fptr.
+bool exists_after(FILE *const fptr, char *const string) {
+    bool exists = false;
+    // Save the position and restore it when the function ends.
+    long pos = ftell(fptr);
+    char *word = NULL;
+    while ((word = getNextWord(fptr))) {
+        word = normaliseWord(word);
+        if (!strcmp(word, string)) {
+            exists = true;
+            break;
+        }
+    }
+    rewind(fptr);
+    fseek(fptr, pos, 0);
+    return exists;
+}
+
+// Outputs the contents of an inverted index to the file specified by fptr.
+void writeInvertedIndex(FILE *const fptr, struct InvertedIndexNode *node) {
+    if (node == NULL) {
+        return;
+    }
+    writeInvertedIndex(fptr, node->left);
+    fprintf(fptr, "%s ", node->word);
+    for (struct FileListNode *i = node->fileList; i != NULL; i = i->next) {
+        fprintf(fptr, "%s (%lf) ", i->filename, i->tf);
+    }
+    fprintf(fptr, "\n");
+    writeInvertedIndex(fptr, node->right);
+}
diff --git a/comp2521/tf_idf/io.h b/comp2521/tf_idf/io.h
new file mode 100644
index 0000000..76b225a
--- /dev/null
+++ b/comp2521/tf_idf/io.h
@@ -0,0 +1,21 @@
+#ifndef IO_H_
+#define IO_H_
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "invertedIndex.h"
+
+// Returns a malloc'ed char* of the next word in the file. NULL otherwise.
+// Undefined behaviour if the word is greater than 100 chars.
+char *getNextWord(FILE *const fptr);
+
+// Returns true if a string in the second argument exists past the fptr in a
+// file.
+bool exists_after(FILE *const fptr, char *const string);
+
+// Outputs the contents of an inverted index to the file specified by fptr.
+void writeInvertedIndex(FILE *const fptr, struct InvertedIndexNode *node);
+#endif
author	Nicolas James <Eele1Ephe7uZahRie@tutanota.com>	2025-02-13 18:00:17 +1100
committer	Nicolas James <Eele1Ephe7uZahRie@tutanota.com>	2025-02-13 18:00:17 +1100
commit	98cef5e9a772602d42acfcf233838c760424db9a (patch)
tree	5277fa1d7cc0a69a0f166fcbf10fd320f345f049 /comp2521