diff options
Diffstat (limited to 'comp2521')
| -rw-r--r-- | comp2521/sna/Dijkstra.c | 153 | ||||
| -rw-r--r-- | comp2521/sna/Dijkstra.h | 99 | ||||
| -rw-r--r-- | comp2521/tf_idf/fileList.c | 118 | ||||
| -rw-r--r-- | comp2521/tf_idf/fileList.h | 35 | ||||
| -rw-r--r-- | comp2521/tf_idf/invertedIndex.c | 153 | ||||
| -rw-r--r-- | comp2521/tf_idf/invertedIndex.h | 78 | ||||
| -rw-r--r-- | comp2521/tf_idf/invertedIndexTree.c | 40 | ||||
| -rw-r--r-- | comp2521/tf_idf/invertedIndexTree.h | 38 | ||||
| -rw-r--r-- | comp2521/tf_idf/io.c | 69 | ||||
| -rw-r--r-- | comp2521/tf_idf/io.h | 21 |
10 files changed, 804 insertions, 0 deletions
diff --git a/comp2521/sna/Dijkstra.c b/comp2521/sna/Dijkstra.c new file mode 100644 index 0000000..2600690 --- /dev/null +++ b/comp2521/sna/Dijkstra.c @@ -0,0 +1,153 @@ +#include "Dijkstra.h" + +#include "PQ.h" +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> + +/* +ShortestPaths { + int numNodes; // The number of vertices in the graph + + Vertex src; // The source vertex + + int *dist; // An array of distances from the source vertex - + // one for each vertex (the distance from src to + // itself is 0) + // dist[v] contains the distance from src to v + + PredNode **pred; // An array of predecessor lists - one for each + // vertex + // pred[v] contains the predecessor list for vertex + // v +} + +PredNode { + Vertex v; + struct PredNode *next; +}; +*/ + +static void free_pred_list(PredNode *head) { + for (PredNode *i = head; i != NULL;) { + PredNode *tmp = i->next; + free(i); + i = tmp; + } +} + +static PredNode *add_pred_back(PredNode *head, const Vertex v) { + PredNode *ret = calloc(1, sizeof(PredNode)); + ret->v = v; + // Empty Case. + if (head == NULL) { + return ret; + } + PredNode *last = head; + // Find the last element in the linked list. + for (PredNode *i = head->next; i != NULL; i = i->next) { + last = i; + } + last->next = ret; + return head; +} + +// Finds the shortest path from src to all other vertices and stores this +// information in a ShortestPaths struct. +// However, if there are multiple shortest paths of the same weight, also store +// such paths as preds. +ShortestPaths dijkstra(Graph g, Vertex src) { + + // Initialise return struct ret and allocate appropriate memory. + ShortestPaths ret; + ret.src = src; + const int num_nodes = GraphNumVertices(g); + ret.numNodes = num_nodes; + ret.dist = calloc((unsigned int)num_nodes, sizeof(int)); + ret.pred = calloc((unsigned int)num_nodes, sizeof(PredNode)); + + // Set all distances as INT_MAX to represent infinity, but the src vertex + // as 0 to ensure it is chosen first. + for (int i = 0; i < num_nodes; ++i) { + ret.dist[i] = INT_MAX; + } + ret.dist[src] = 0; + + // Initialise queue, insert itself in queue and initialise its distance + // as 0. + PQ queue = PQNew(); + PQInsert(queue, src, ret.dist[src]); + + // Perform Dijkstra's algorithm by looping through the queue until empty. + while (!PQIsEmpty(queue)) { + // Dequeue the vertex with the lowest weight. + const Vertex vertex = PQDequeue(queue); + + // Loop through all adjacent vertices for the vertex. + AdjList adjacents = GraphOutIncident(g, vertex); + for (AdjList i = adjacents; i != NULL; i = i->next) { + const int v = i->v; + const int w = i->weight; + const int path = ret.dist[vertex] + w; + + // Inefficient path, continue. + if (ret.dist[v] < path) { + continue; + } + + // Better path than previously discovered. Free the current pred + // list and update distance. + if (ret.dist[v] > path) { + ret.dist[v] = path; + PQInsert(queue, v, ret.dist[v]); + free_pred_list(ret.pred[v]); + ret.pred[v] = NULL; + } + + // Update pred list. + ret.pred[v] = add_pred_back(ret.pred[v], vertex); + } + } + + // After performing the algorithm, we must set all INT_MAX distances back + // to zero to conform to the spec. + for (int i = 0; i < num_nodes; ++i) { + if (ret.dist[i] == INT_MAX) { + ret.dist[i] = 0; + } + } + + return ret; +} + +// Prints out the ShortestPaths structure. +// Does not need to comply with any requirements. +void showShortestPaths(ShortestPaths sps) { + printf("Shortest path\n"); + printf("\tNumNodes: %d\n", sps.numNodes); + printf("\tsrc: %d\n", sps.src); + printf("\tdist\n"); + for (int i = 0; i < sps.numNodes; ++i) { + printf("\t\t%d: %d\n", i, sps.dist[i]); + } + printf("\tpred\n"); + for (int i = 0; i < sps.numNodes; ++i) { + printf("\t\t%d\n", i); + for (PredNode *j = sps.pred[i]; j != NULL; j = j->next) { + printf("\t\t\t%d\n", j->v); + } + } +} + +// Frees all memory associated with a ShortestPaths structure. +void freeShortestPaths(ShortestPaths sps) { + for (int i = 0; i < sps.numNodes; ++i) { + for (PredNode *j = sps.pred[i]; j != NULL;) { + PredNode *next = j->next; + free(j); + j = next; + } + } + free(sps.pred); + free(sps.dist); +} diff --git a/comp2521/sna/Dijkstra.h b/comp2521/sna/Dijkstra.h new file mode 100644 index 0000000..0c3578b --- /dev/null +++ b/comp2521/sna/Dijkstra.h @@ -0,0 +1,99 @@ +// Dijkstra ADT interface +// COMP2521 Assignment 2 + +// Note: You MUST NOT modify this file. + +#include <stdbool.h> + +#include "Graph.h" + +typedef struct PredNode { + Vertex v; + struct PredNode *next; +} PredNode; + +typedef struct ShortestPaths { + int numNodes; // The number of vertices in the graph + + Vertex src; // The source vertex + + int *dist; // An array of distances from the source vertex - + // one for each vertex (the distance from src to + // itself is 0) + // dist[v] contains the distance from src to v + + PredNode **pred; // An array of predecessor lists - one for each + // vertex + // pred[v] contains the predecessor list for vertex + // v +} ShortestPaths; + +/** + * Finds all shortest paths from a given source vertex to all other + * vertices as discussed in the lectures. + * + * This function offers one **additional feature** - if there is more + * than one shortest path from the source vertex to another vertex, a + * vertex could have multiple predecessors (see spec for an example). + * The function must keep track of multiple predecessors for each vertex + * (if they exist) by storing the predecessor(s) in the corresponding + * linked list for that vertex. This will be discussed in detail in the + * lecture. + * + * For example, suppose that while finding the shortest paths from the + * source vertex 0 to all other vertices, we discovered that vertex 1 + * has two possible predecessors on different shortest paths. + * + * Node 0 + * Distance + * 0 : X + * 1 : 2 + * 2 : 1 + * Preds + * 0 : NULL + * 1 : [0]->[2]->NULL + * 2 : [0]->NULL + * + * Node 1 + * Distance + * 0 : 2 + * 1 : X + * 2 : 3 + * Preds + * 0 : [1]->NULL + * 1 : NULL + * 2 : [0]->NULL + * + * Node 2 + * Distance + * 0 : 3 + * 1 : 1 + * 2 : X + * Preds + * 0 : [1]->NULL + * 1 : [2]->NULL + * 2 : NULL + * + * The function returns a 'ShortestPaths' structure with the required + * information: + * - the number of vertices in the graph + * - the source vertex + * - distance array + * - array of predecessor lists + */ +ShortestPaths dijkstra(Graph g, Vertex src); + +/** + * This function is for you to print out the ShortestPaths structure + * while you are debugging/testing your implementation. + * + * We will not call this function during testing, so you may print out + * the given ShortestPaths structure in whatever format you want. + */ +void showShortestPaths(ShortestPaths sps); + +/** + * Frees all memory associated with the given ShortestPaths structure. + */ +void freeShortestPaths(ShortestPaths sps); + diff --git a/comp2521/tf_idf/fileList.c b/comp2521/tf_idf/fileList.c new file mode 100644 index 0000000..53dc0f9 --- /dev/null +++ b/comp2521/tf_idf/fileList.c @@ -0,0 +1,118 @@ +#include "fileList.h" + +// Returns a newly malloc'ed file list node, with word initialised to word +// and tf set to 1. +struct FileListNode *createFileListNode(char *const word) { + struct FileListNode *ret = calloc(1, sizeof(struct FileListNode)); + ret->filename = word; + ret->tf = 1; + return ret; +} + +// Inserts a fileListNode into an InvertedIndexNode->fileList based off a +// string in alphabetical order. Increments tf by 1 already exists. +// Returns new or preexisting fileListNode *. +struct FileListNode *insertFileList(struct InvertedIndexNode *node, + char *const filename) { + struct FileListNode *ret = createFileListNode(filename); + // Empty case. + if (!node->fileList) { + node->fileList = ret; + return ret; + } + // Front case. + int comparison = strcmp(filename, node->fileList->filename); + if (comparison < 0) { + ret->next = node->fileList; + node->fileList = ret; + return ret; + } else if (comparison == 0) { + ++node->fileList->tf; + free(ret); + return node->fileList; + } + // Middle case. + struct FileListNode *last = node->fileList; + for (struct FileListNode *i = node->fileList->next; i; i = i->next) { + int comp = strcmp(filename, i->filename); + if (comp < 0) { + last->next = ret; + ret->next = i; + return ret; + } else if (comp == 0) { + ++i->tf; + free(ret); + return i; + } + last = i; + } + // Back case. + last->next = ret; + return ret; +} + +// Inserts a fileListNode into a list via ordering of tf values. If the tf +// values are equal, inserts based off the filename in ascending order. +// Returns the new head of the list. +struct FileListNode *insertFileOrderedList(struct FileListNode *head, + struct FileListNode *insert, + double mult) { + struct FileListNode *ret = calloc(1, sizeof(struct FileListNode)); + ret->filename = strdup(insert->filename); + ret->tf = insert->tf * mult; + // Empty case. + if (!head) { + return ret; + } + + struct FileListNode *prev_duplicate = NULL; + struct FileListNode *duplicate = NULL; + for (struct FileListNode *i = head; i; i = i->next) { + if (!strcmp(ret->filename, i->filename)) { + duplicate = i; + break; + } + prev_duplicate = i; + } + + if (duplicate != NULL) { + if (prev_duplicate != NULL) { + prev_duplicate->next = duplicate->next; + } else { + head = duplicate->next; + } + duplicate->tf += ret->tf; + free(ret->filename); + free(ret); + return insertFileOrderedList(head, duplicate, mult); + } + + struct FileListNode *last = NULL; + for (struct FileListNode *i = head; i; i = i->next) { + const double scmp = strcmp(ret->filename, i->filename); + + if (ret->tf > i->tf) { + break; + } else if (insert->tf == i->tf) { + if (scmp <= 0) { + break; + } else { + last = i; + } + } else { + last = i; + } + } + + // Front case. + if (!last) { + ret->next = head; + return ret; + } + + // Middle or end case. + ret->next = last->next; + last->next = ret; + + return head; +} diff --git a/comp2521/tf_idf/fileList.h b/comp2521/tf_idf/fileList.h new file mode 100644 index 0000000..e7afb54 --- /dev/null +++ b/comp2521/tf_idf/fileList.h @@ -0,0 +1,35 @@ +#ifndef FILELIST_C_ +#define FILELIST_C_ + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +#include "invertedIndex.h" + +// Forward declaration for strdup in <string.h> +char *strdup(const char *); + +/* + * File list node is unfortuntely defined in "invertedIndex.h": + struct FileListNode { + char *filename; + double tf; // relative tf + struct FileListNode *next; + }; +*/ + +// Returns a newly malloc'ed file list node, with fields initialised to zero. +struct FileListNode *createFileListNode(); + +// Appends a FileListNode to the end of a FileListNode linked list. +struct FileListNode *insertFileList(struct InvertedIndexNode *head, + char *const word); + +// Inserts a fileListNode into a list via ordering of tf values. If the tf +// values are equal, inserts based off the filename in ascending order. +// Returns the new head of the list. +struct FileListNode *insertFileOrderedList(struct FileListNode *head, + struct FileListNode *insert, + double mult); +#endif diff --git a/comp2521/tf_idf/invertedIndex.c b/comp2521/tf_idf/invertedIndex.c new file mode 100644 index 0000000..e3c21a1 --- /dev/null +++ b/comp2521/tf_idf/invertedIndex.c @@ -0,0 +1,153 @@ +#include "invertedIndex.h" +#include "fileList.h" +#include "invertedIndexTree.h" +#include "io.h" + +// Removes leading and trailing spaces, converts all characters to lowercase +// and removes the punctutation marks .,;? if they end the string. +char *normaliseWord(char *str) { + char *front = str; + for (int i = 0; str[i] != '\0'; ++i) { + if (!isspace(str[i])) { + front = &str[i]; + break; + } + } + char *end = str; + for (int i = 0; front[i] != '\0'; ++i) { + front[i] = (char)tolower(front[i]); + if (front[i + 1] == '\0') { + end = &front[i + 1]; + switch (front[i]) { + case '.': + case ',': + case ';': + case '?': + front[i] = '\0'; + --end; + } + } + } + return memmove(str, front, (unsigned long)(end - front + 1)); +} + +// Opens the collection file, generates an inverted index from those files +// listed in the collection file. Returns a generated inverted index. +struct InvertedIndexNode *generateInvertedIndex(char *collectionFilename) { + FILE *f_fptr = fopen(collectionFilename, "r"); + if (!f_fptr) { + fprintf(stderr, + "fopen(%s, \"r\")\" in generateInvertedIndex returned" + " NULL!\n", + collectionFilename); + return NULL; + } + // Read file for filenames, read those files for words, insert those words + // in an inverted index. Duplicates words increment tf by one for that + // file. + struct InvertedIndexNode *index = createInvertedIndexNode(NULL); + char *filename = NULL; + while ((filename = getNextWord(f_fptr))) { + FILE *w_fptr = fopen(filename, "r"); + if (!w_fptr) { + fprintf(stderr, + "fopen(%s, \"r\") in generateInvertedIndex" + " returned NULL!\n", + filename); + return NULL; + } + char *word = NULL; + int word_count = 0; + // Two loops are required to get the total amount of words required in + // calculating tf. + while ((word = getNextWord(w_fptr))) { + ++word_count; + } + rewind(w_fptr); + + while ((word = getNextWord(w_fptr))) { + word = normaliseWord(word); + // Base case where index is empty, handle manually. + if (index->word == NULL) { + index->word = word; + } + insertInvertedIndexNode(index, word); + struct InvertedIndexNode *i_node = + searchInvertedIndexNode(index, word); + struct FileListNode *f_node = insertFileList(i_node, filename); + // There may be multiple words in the same file, so only write to + // tf if the word does not exist again in the same file after that + // point. + if (!exists_after(w_fptr, word)) { + f_node->tf /= word_count; + } + } + fclose(w_fptr); + } + fclose(f_fptr); + return index; +} + +// Outputs the given inverted index to a file named invertedIndex.txt. +// One line per word, alphabetical, ascending order. +void printInvertedIndex(struct InvertedIndexNode *node) { + if (!node) { + return; + } + FILE *fptr = fopen("invertedIndex.txt", "w"); + if (fptr == NULL) { + fprintf(stderr, + "fopen(%s, \"r\") in printInvertedIndex" + " returned NULL!\n", + "invertedIndex.txt"); + return; + } + writeInvertedIndex(fptr, node); + fclose(fptr); +} + +// Returns an ordered list where each node contains a filename and the +// corresponding tf-idf value for a given searchWord. You only need to +// include documents (files) that contain the given searchWord. The list +// must be in descending order of tf-idf value. If there are multiple +// files with same tf-idf, order them by their filename in ascending +// order. D is the total number of documents in the collection. +struct TfIdfNode *calculateTfIdf(struct InvertedIndexNode *tree, + char *searchWord, int D) { + struct InvertedIndexNode *i_node = + searchInvertedIndexNode(tree, searchWord); + struct FileListNode *f_node = i_node->fileList; + int f_list_size = 0; + for (struct FileListNode *i = f_node; i; i = i->next) { + ++f_list_size; + } + double idf = log10((double)D / (double)f_list_size); + struct FileListNode *ret = NULL; + // tfidf is idf * tf + for (struct FileListNode *i = f_node; i; i = i->next) { + ret = insertFileOrderedList(ret, i, idf); + } + // Fortunately, the contents of the two structs are the same. We can just + // cast the pointer to the correct return type, and return that. + return (struct TfIdfNode *)ret; +} + +// Returns an ordered list where each node contains a filename and the +// summation of tf-idf values of all the matching search words for that +// file. You only need to include documents (files) that contain one or +// more of the given search words. The list must be in descending order +// of summation of tf-idf values (tfIdfSum). If there are multiple files +// with the same tf-idf sum, order them by their filename in ascending +// order. D is the total number of documents in the collection. +struct TfIdfNode *retrieve(InvertedIndexBST tree, char *searchWords[], int D) { + struct FileListNode *ret = NULL; + for (char **i = searchWords; *i; ++i) { + struct FileListNode *node = + (struct FileListNode *)calculateTfIdf(tree, *i, D); + + for (struct FileListNode *f_i = node; f_i; f_i = f_i->next) { + ret = insertFileOrderedList(ret, f_i, 1.0); + } + } + return (struct TfIdfNode *)ret; +} diff --git a/comp2521/tf_idf/invertedIndex.h b/comp2521/tf_idf/invertedIndex.h new file mode 100644 index 0000000..ddf9932 --- /dev/null +++ b/comp2521/tf_idf/invertedIndex.h @@ -0,0 +1,78 @@ +// COMP2521 Assignment 1 +// DO NOT MODIFY THIS FILE + +#ifndef _INVERTEDINDEX_GUARD +#define _INVERTEDINDEX_GUARD + +struct FileListNode { + char *filename; + double tf; // relative tf + struct FileListNode *next; +}; +typedef struct FileListNode *FileList; + +struct InvertedIndexNode { + char *word; // key + struct FileListNode *fileList; + struct InvertedIndexNode *left; + struct InvertedIndexNode *right; + +}; +typedef struct InvertedIndexNode *InvertedIndexBST; + +struct TfIdfNode { + char *filename; + double tfIdfSum; // tfidf sum value + struct TfIdfNode *next; +}; +typedef struct TfIdfNode *TfIdfList; + +// Functions for Part 1 + +/** + * Normalises a given string. See the spec for details. Note: you should + * modify the given string - do not create a copy of it. + */ +char *normaliseWord(char *str); + +/** + * This function opens the collection file with the given name, and then + * generates an inverted index from those files listed in the collection + * file, as discussed in the spec. It returns the generated inverted + * index. + */ +InvertedIndexBST generateInvertedIndex(char *collectionFilename); + +/** + * Outputs the given inverted index to a file named invertedIndex.txt. + * The output should contain one line per word, with the words ordered + * alphabetically in ascending order. Each list of filenames for a word + * should be ordered alphabetically in ascending order. +*/ +void printInvertedIndex(InvertedIndexBST tree); + +// Functions for Part-2 + +/** + * Returns an ordered list where each node contains a filename and the + * corresponding tf-idf value for a given searchWord. You only need to + * include documents (files) that contain the given searchWord. The list + * must be in descending order of tf-idf value. If there are multiple + * files with same tf-idf, order them by their filename in ascending + * order. D is the total number of documents in the collection. + */ +TfIdfList calculateTfIdf(InvertedIndexBST tree, char *searchWord, int D); + +/** + * Returns an ordered list where each node contains a filename and the + * summation of tf-idf values of all the matching search words for that + * file. You only need to include documents (files) that contain one or + * more of the given search words. The list must be in descending order + * of summation of tf-idf values (tfIdfSum). If there are multiple files + * with the same tf-idf sum, order them by their filename in ascending + * order. D is the total number of documents in the collection. + */ +TfIdfList retrieve(InvertedIndexBST tree, char *searchWords[], int D); + +#endif + diff --git a/comp2521/tf_idf/invertedIndexTree.c b/comp2521/tf_idf/invertedIndexTree.c new file mode 100644 index 0000000..b66d6a8 --- /dev/null +++ b/comp2521/tf_idf/invertedIndexTree.c @@ -0,0 +1,40 @@ +#include "invertedIndexTree.h" + +// Returns a newly malloc'ed inverted index node, fields initialised to zero +// except for word. +struct InvertedIndexNode *createInvertedIndexNode(char *const word) { + struct InvertedIndexNode *ret = calloc(1, sizeof(struct InvertedIndexNode)); + ret->word = word; + return ret; +} + +// Inserts to a FileListNode BST. Uses strcmp to determine priority, with +// negative values moving into the left and positive values into the right. +struct InvertedIndexNode * +insertInvertedIndexNode(struct InvertedIndexNode *node, char *const word) { + if (node == NULL) { + return createInvertedIndexNode(word); + } + + const int comparison = strcmp(word, node->word); + if (comparison < 0) { + node->left = insertInvertedIndexNode(node->left, word); + } + if (comparison > 0) { + node->right = insertInvertedIndexNode(node->right, word); + } + return node; +} + +// Searches for an invertedIndexNode via a string. +struct InvertedIndexNode * +searchInvertedIndexNode(struct InvertedIndexNode *head, char *const word) { + int comparison = strcmp(word, head->word); + if (head == NULL || !comparison) { + return head; + } + if (comparison < 0) { + return searchInvertedIndexNode(head->left, word); + } + return searchInvertedIndexNode(head->right, word); +} diff --git a/comp2521/tf_idf/invertedIndexTree.h b/comp2521/tf_idf/invertedIndexTree.h new file mode 100644 index 0000000..29f93c5 --- /dev/null +++ b/comp2521/tf_idf/invertedIndexTree.h @@ -0,0 +1,38 @@ +#ifndef INVERTEDINDEXTREE_H_ +#define INVERTEDINDEXTREE_H_ + +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "invertedIndex.h" + +/* + * invertedIndexTree is a binary search tree of the following struct, defined + * in invertedIndex.h": + struct InvertedIndexNode { + char *word; // key + struct FileListNode *fileList; + struct InvertedIndexNode *left; + struct InvertedIndexNode *right; + + }; +*/ + +// Returns a newly malloc'ed inverted index node, fields initialised to zero +// except for word. +struct InvertedIndexNode *createInvertedIndexNode(char *const word); + +// Inserts to a FileListNode BST. Uses strcmp to determine priority, with +// negative values moving into the left and positive values into the right. +struct InvertedIndexNode * +insertInvertedIndexNode(struct InvertedIndexNode *node, char *const word); + +// Searches for an invertedIndexNode based off the a word. +struct InvertedIndexNode * +searchInvertedIndexNode(struct InvertedIndexNode *node, char *const word); + +// Searches for an invertedIndexNode via a string. +struct InvertedIndexNode * +searchInvertedIndexNode(struct InvertedIndexNode *head, char *const word); +#endif diff --git a/comp2521/tf_idf/io.c b/comp2521/tf_idf/io.c new file mode 100644 index 0000000..566b1b9 --- /dev/null +++ b/comp2521/tf_idf/io.c @@ -0,0 +1,69 @@ +#include "io.h" + +// Returns a malloc'ed char* of the next word in the file. NULL otherwise. +// Undefined behaviour if the word is greater than 100 chars. +// We will assume that the specification is referring to length as +// LENGTH + \0, so 101 bytes should be allocated. +char *getNextWord(FILE *const fptr) { + // We can assume a maximum word length of 100. + char *ret = malloc(sizeof(char) * 101); + // Read until there is a non-space. + for (int c = fgetc(fptr); c != EOF; c = fgetc(fptr)) { + if (isspace(c)) { + continue; + } + // Put the char back into the stream so that we may read it correctly + // in the next loop. + ungetc(c, fptr); + break; + } + int str_pos = 0; + // Read the word and write it into ret, until there is another space char. + for (int c = fgetc(fptr); c != EOF; c = fgetc(fptr)) { + if (isspace(c)) { + break; + } + ret[str_pos++] = (char)c; + } + ret[str_pos] = '\0'; + // Only return a value if ret has been written to. + if (!str_pos) { + free(ret); + return NULL; + } + // Requires +2 as str_pos is (size -1) AND null terminator. + return realloc(ret, (size_t)str_pos + 1); +} + +// Returns true if the string in the second argument exists past the fptr in a +// file. Preserves original position of the fptr. +bool exists_after(FILE *const fptr, char *const string) { + bool exists = false; + // Save the position and restore it when the function ends. + long pos = ftell(fptr); + char *word = NULL; + while ((word = getNextWord(fptr))) { + word = normaliseWord(word); + if (!strcmp(word, string)) { + exists = true; + break; + } + } + rewind(fptr); + fseek(fptr, pos, 0); + return exists; +} + +// Outputs the contents of an inverted index to the file specified by fptr. +void writeInvertedIndex(FILE *const fptr, struct InvertedIndexNode *node) { + if (node == NULL) { + return; + } + writeInvertedIndex(fptr, node->left); + fprintf(fptr, "%s ", node->word); + for (struct FileListNode *i = node->fileList; i != NULL; i = i->next) { + fprintf(fptr, "%s (%lf) ", i->filename, i->tf); + } + fprintf(fptr, "\n"); + writeInvertedIndex(fptr, node->right); +} diff --git a/comp2521/tf_idf/io.h b/comp2521/tf_idf/io.h new file mode 100644 index 0000000..76b225a --- /dev/null +++ b/comp2521/tf_idf/io.h @@ -0,0 +1,21 @@ +#ifndef IO_H_ +#define IO_H_ +#include <ctype.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "invertedIndex.h" + +// Returns a malloc'ed char* of the next word in the file. NULL otherwise. +// Undefined behaviour if the word is greater than 100 chars. +char *getNextWord(FILE *const fptr); + +// Returns true if a string in the second argument exists past the fptr in a +// file. +bool exists_after(FILE *const fptr, char *const string); + +// Outputs the contents of an inverted index to the file specified by fptr. +void writeInvertedIndex(FILE *const fptr, struct InvertedIndexNode *node); +#endif |
