diff --git a/CMakeLists.txt b/CMakeLists.txt index a2da730..e1fca1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,4 +15,4 @@ SHARED ) add_executable(hnsw_test src/test.c) -target_link_libraries(hnsw_test hnswc) +target_link_libraries(hnsw_test hnswc m) diff --git a/dataset/siftsmall_base.fvecs b/dataset/siftsmall_base.fvecs new file mode 100644 index 0000000..e3b90ae Binary files /dev/null and b/dataset/siftsmall_base.fvecs differ diff --git a/dataset/siftsmall_learn.fvecs b/dataset/siftsmall_learn.fvecs new file mode 100644 index 0000000..9ea42f0 Binary files /dev/null and b/dataset/siftsmall_learn.fvecs differ diff --git a/dataset/siftsmall_query.fvecs b/dataset/siftsmall_query.fvecs new file mode 100644 index 0000000..88622e3 Binary files /dev/null and b/dataset/siftsmall_query.fvecs differ diff --git a/inc/utils.h b/inc/utils.h index 090093e..9a593b2 100644 --- a/inc/utils.h +++ b/inc/utils.h @@ -2,5 +2,4 @@ typedef char* VecData; float vec_dist(VecData x, VecData y); - - +void fvecs_read(const char* filename, int* bound, float** vectors, int* num_vectors, int* vector_dimension); \ No newline at end of file diff --git a/src/test.c b/src/test.c index 4721e0e..3209e1e 100644 --- a/src/test.c +++ b/src/test.c @@ -1,8 +1,27 @@ #include +#include #include "hnsw.h" #include "utils.h" + int main() { - printf("Hello, world!\n"); + const char* filename = "../dataset/siftsmall_base.fvecs"; + int bound[2] = {2, 5}; + float* vectors; + int num_vectors, vector_dimension; + fvecs_read(filename, bound, &vectors, &num_vectors, &vector_dimension); + + printf("Number of vectors: %d\n", num_vectors); + printf("Vector dimension: %d\n", vector_dimension); + int i, j; + for (i = 0; i < num_vectors; i++) { + printf("Vector %d: ", i); + for (j = 0; j < vector_dimension; j++) { + printf("%f ", vectors[i * vector_dimension + j]); + } + printf("\n"); + } + + free(vectors); return 0; } \ No newline at end of file diff --git a/src/utils.c b/src/utils.c index e69de29..80032ef 100644 --- a/src/utils.c +++ b/src/utils.c @@ -0,0 +1,92 @@ +#include "utils.h" +#include +#include +#include +#include +#include + +float vec_dist(VecData x, VecData y) { + float dist = 0.0f; + while (*x != '\0' && *y != '\0') { + int xi = *x - '0'; + int yi = *y - '0'; + dist += pow(xi - yi, 2); + x++; + y++; + } + return sqrt(dist); +} + +void fvecs_read(const char* filename, int* bound, float** vectors, int* num_vectors, int* vector_dimension) { + FILE* fid = fopen(filename, "rb"); + if (fid == NULL) { + fprintf(stderr, "I/O error: Unable to open the file %s\n", filename); + exit(EXIT_FAILURE); + } + + // Read the vector size + int d; + fread(&d, sizeof(int), 1, fid); + + // Get the file size + fseek(fid, 0, SEEK_END); + long file_size = ftell(fid); + fseek(fid, 0, SEEK_SET); + + // Get the number of vectors + long vec_size = (long) d * sizeof(float); + long vec_count = (file_size - sizeof(int)) / vec_size; + + // Apply the bounds if specified + int a = 1; + int b = vec_count; + if (bound != NULL) { + if (bound[1] == 1) { + b = bound[0]; + } else if (bound[1] == 2) { + a = bound[0]; + b = bound[1]; + } + } + + // Check if the bounds are valid + if (a < 1 || b < a || b > vec_count) { + *vectors = NULL; + fclose(fid); + return; + } + + // Compute the number of vectors to read + int n = b - a + 1; + + // Read the vectors + *vectors = malloc(n * d * sizeof(float)); + float* ptr = *vectors; + for (int i = 0; i < vec_count; i++) { + // Read the vector size + int vec_d; + fread(&vec_d, sizeof(int), 1, fid); + + // Check if the vector size is correct + if (vec_d != d) { + fprintf(stderr, "Error: Vector %d has incorrect dimension %d (expected %d)\n", i + 1, vec_d, d); + fclose(fid); + free(*vectors); + *vectors = NULL; + return; + } + + // Read the vector data + fread(ptr, sizeof(float), d, fid); + ptr += d; + + // Stop reading if we have read enough vectors + if (i >= b - 1) { + break; + } + } + + *vector_dimension = d; + *num_vectors = n; + fclose(fid); +} \ No newline at end of file