diff options
| author | Nicolas James <Eele1Ephe7uZahRie@tutanota.com> | 2025-02-13 18:00:17 +1100 |
|---|---|---|
| committer | Nicolas James <Eele1Ephe7uZahRie@tutanota.com> | 2025-02-13 18:00:17 +1100 |
| commit | 98cef5e9a772602d42acfcf233838c760424db9a (patch) | |
| tree | 5277fa1d7cc0a69a0f166fcbf10fd320f345f049 /comp2521/tf_idf/io.c | |
initial commit
Diffstat (limited to 'comp2521/tf_idf/io.c')
| -rw-r--r-- | comp2521/tf_idf/io.c | 69 |
1 files changed, 69 insertions, 0 deletions
diff --git a/comp2521/tf_idf/io.c b/comp2521/tf_idf/io.c new file mode 100644 index 0000000..566b1b9 --- /dev/null +++ b/comp2521/tf_idf/io.c @@ -0,0 +1,69 @@ +#include "io.h" + +// Returns a malloc'ed char* of the next word in the file. NULL otherwise. +// Undefined behaviour if the word is greater than 100 chars. +// We will assume that the specification is referring to length as +// LENGTH + \0, so 101 bytes should be allocated. +char *getNextWord(FILE *const fptr) { + // We can assume a maximum word length of 100. + char *ret = malloc(sizeof(char) * 101); + // Read until there is a non-space. + for (int c = fgetc(fptr); c != EOF; c = fgetc(fptr)) { + if (isspace(c)) { + continue; + } + // Put the char back into the stream so that we may read it correctly + // in the next loop. + ungetc(c, fptr); + break; + } + int str_pos = 0; + // Read the word and write it into ret, until there is another space char. + for (int c = fgetc(fptr); c != EOF; c = fgetc(fptr)) { + if (isspace(c)) { + break; + } + ret[str_pos++] = (char)c; + } + ret[str_pos] = '\0'; + // Only return a value if ret has been written to. + if (!str_pos) { + free(ret); + return NULL; + } + // Requires +2 as str_pos is (size -1) AND null terminator. + return realloc(ret, (size_t)str_pos + 1); +} + +// Returns true if the string in the second argument exists past the fptr in a +// file. Preserves original position of the fptr. +bool exists_after(FILE *const fptr, char *const string) { + bool exists = false; + // Save the position and restore it when the function ends. + long pos = ftell(fptr); + char *word = NULL; + while ((word = getNextWord(fptr))) { + word = normaliseWord(word); + if (!strcmp(word, string)) { + exists = true; + break; + } + } + rewind(fptr); + fseek(fptr, pos, 0); + return exists; +} + +// Outputs the contents of an inverted index to the file specified by fptr. +void writeInvertedIndex(FILE *const fptr, struct InvertedIndexNode *node) { + if (node == NULL) { + return; + } + writeInvertedIndex(fptr, node->left); + fprintf(fptr, "%s ", node->word); + for (struct FileListNode *i = node->fileList; i != NULL; i = i->next) { + fprintf(fptr, "%s (%lf) ", i->filename, i->tf); + } + fprintf(fptr, "\n"); + writeInvertedIndex(fptr, node->right); +} |
