-
Notifications
You must be signed in to change notification settings - Fork 10
Update Duplicate Detection #77
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
5f2e901
994e99f
ea6430f
cf0dbb5
a861fb4
b4634f3
63a4dc2
22e55a3
4b656a4
fa28afa
81a1d2f
e624d40
b0e000b
0874365
01d918a
42a65f1
b281037
ee0bca7
0c9eb2c
394764e
9b8cc19
17156b7
648e30a
29640a1
0fa62d3
d3b571c
06bae81
13fd4b7
e83c15f
070cfe3
8795aaf
2e948d8
4c347be
d177522
7b49a5b
e9c10a5
b6abca7
8c86b5f
63fa5ba
d0d4f54
3b86627
6b83c67
66029f5
fce7a2c
b9defb8
636814e
495597e
ab71333
65e4458
68d1c68
8ba449e
2be1d22
814fcfa
8917ae2
6eeeb2d
c561728
ee9c410
f8aa100
7dfc69b
d1c2185
fd289ff
dbc01f2
4e96440
c7c6fcb
b411968
0fcf5e5
f47c678
e799efa
b9a9d36
c9241cf
5fd3d96
3e57fa6
32ef9e0
00bdc0e
118e22b
19cd50b
ebb9585
6544e4e
9de3d40
9cac9b8
98d8a11
a0936d6
f8572ad
ddd99f6
d17c043
8d5364e
7e2979b
59eda46
cf26c09
f447c19
9449178
fd84d79
00c07f1
b540a38
e27308e
bda9eac
62eebee
d6e9d98
71b3c60
fa048eb
99f0a29
33d6150
b2e0f35
326a475
06f07b9
0c8f5cc
011d952
8506a39
f1e6bfe
e705a68
9a88787
dfbd6c4
07129f8
cc85e7b
b369ee4
5ea6848
0a96f95
44df6fc
a041dca
ef1107b
de74a99
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -65,3 +65,4 @@ perf.data.old | |
| .idea/ | ||
| .vscode/ | ||
| centrallix-os/tmp/* | ||
| centrallix-os/datasets/ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like you need to run make in the centrallix-doc/Widgets directory to propagate the XML change to the relevant HTML files.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for showing me how to build the HTML, done. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,161 @@ | ||
| #ifndef CLUSTERS_H | ||
| #define CLUSTERS_H | ||
|
|
||
| /************************************************************************/ | ||
| /* Centrallix Application Server System */ | ||
| /* Centrallix Core */ | ||
| /* */ | ||
| /* Copyright (C) 1998-2026 LightSys Technology Services, Inc. */ | ||
| /* */ | ||
| /* This program is free software; you can redistribute it and/or modify */ | ||
| /* it under the terms of the GNU General Public License as published by */ | ||
| /* the Free Software Foundation; either version 2 of the License, or */ | ||
| /* (at your option) any later version. */ | ||
| /* */ | ||
| /* This program is distributed in the hope that it will be useful, */ | ||
| /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| /* GNU General Public License for more details. */ | ||
| /* */ | ||
| /* You should have received a copy of the GNU General Public License */ | ||
| /* along with this program; if not, write to the Free Software */ | ||
| /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ | ||
| /* 02111-1307 USA */ | ||
| /* */ | ||
| /* A copy of the GNU General Public License has been included in this */ | ||
| /* distribution in the file "COPYING". */ | ||
| /* */ | ||
| /* Module: lib_cluster.c, lib_cluster.h */ | ||
| /* Author: Israel Fuller */ | ||
| /* Creation: September 29, 2025 */ | ||
| /* Description: Clustering library used to cluster and search data with */ | ||
| /* cosine or Levenshtein (a.k.a. edit distance) similarity */ | ||
| /* measures. Used by the "clustering driver". */ | ||
| /* For more information on how to use this library, see */ | ||
| /* string-similarity.md in the centrallix-sysdoc folder. */ | ||
| /************************************************************************/ | ||
|
|
||
| #include <stdlib.h> | ||
| #include <stdbool.h> | ||
|
|
||
| #ifdef CXLIB_INTERNAL | ||
| #include "xarray.h" | ||
| #else | ||
| #include "cxlib/xarray.h" | ||
| #endif | ||
|
|
||
| /** This file has additional documentation in string_similarity.md. **/ | ||
|
|
||
|
|
||
| /*** This value defines the number of dimensions used for a sparse | ||
| *** vector. The higher the number, the fewer collisions will be | ||
| *** encountered when using these vectors for cosine comparisons. | ||
| *** This is also called the vector table size, if viewing the | ||
| *** vector as a hash table of character pairs. | ||
| *** | ||
| *** 2147483647 is the signed int max, and is also a prime number. | ||
| *** Using this value ensures that the longest run of 0s will not | ||
| *** cause an int underflow with the current encoding scheme. | ||
| *** | ||
| *** Unfortunately, we can't use a number this large yet because the | ||
| *** kmeans algorithm creates densely allocated centroids with | ||
| *** `CA_NUM_DIMS` dimensions, so a large number causes it to fail. | ||
| *** Thus, we use 251, the largest prime number less than 256, | ||
| *** giving us a decent balance between collision reduction and | ||
| *** kmeans centroid performance/memory overhead. | ||
| ***/ | ||
| #define CA_NUM_DIMS 251 | ||
|
|
||
| /*** The character used to create a pair with the first and last characters | ||
| *** of a string. Currently set to 96, the character just before 'a' (97) | ||
| *** in the ASCII table. | ||
| ***/ | ||
| #define CA_BOUNDARY_CHAR ((unsigned char)('a' - 1)) | ||
|
|
||
| /** Types. **/ | ||
| typedef int* pVector; /* Sparse vector. */ | ||
| typedef double* pCentroid; /* Dense centroid. */ | ||
| #define CENTROID_SIZE (CA_NUM_DIMS * sizeof(double)) | ||
|
|
||
| /*** Information about detected matching pairs. | ||
| *** | ||
| *** @param i The index into the provided data for the first element of the pair. | ||
| *** @param j The index into the provided data for the second element of the pair. | ||
| *** @param similarity A number from 0 to 1, from a similarity function, showing | ||
| *** how similar the two elements of the pair are. | ||
| ***/ | ||
| typedef struct | ||
| { | ||
| unsigned int i, j; | ||
| double similarity; | ||
| } | ||
| Pair, *pPair; | ||
|
|
||
|
|
||
| /** Edit distance function. **/ | ||
| int ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length); | ||
|
|
||
| /** Vector functions. **/ | ||
| pVector ca_build_vector(const char* str); | ||
| unsigned int ca_sparse_len(const pVector vector); | ||
| void ca_print_vector(const pVector vector); | ||
| void ca_free_vector(pVector sparse_vector); | ||
|
|
||
| /** k-means function. **/ | ||
| int ca_kmeans( | ||
| pVector* vectors, | ||
| const unsigned int num_vectors, | ||
| const unsigned int num_clusters, | ||
| const unsigned int max_iter, | ||
| const double min_improvement, | ||
| unsigned int* labels, | ||
| double* vector_sims, | ||
| bool auto_seed); | ||
|
|
||
| /** Vector helper macros. **/ | ||
| /*** True when the first entry of the sparse vector equals -CA_NUM_DIMS | ||
| *** (presumably a single run of CA_NUM_DIMS zeros under the sparse | ||
| *** encoding - confirm against ca_build_vector()). | ||
| *** | ||
| *** The argument is parenthesized so that expressions such as `vecs + i` | ||
| *** or `*ptr` are indexed as a whole; it is evaluated exactly once. | ||
| ***/ | ||
| #define ca_is_empty(vector) ((vector)[0] == -(CA_NUM_DIMS)) | ||
| /*** Note: Given that CA_NUM_DIMS == 251, ca_build_vector("") will give the | ||
| *** vector we check for in the ca_has_no_pairs() macro, [-172, 11, -78], | ||
| *** which has a single pair of boundary characters. | ||
| *** If CA_NUM_DIMS is modified, this macro will need to be updated, hence the | ||
| *** compiler directive causing it to be undefined in this case, likely leading | ||
| *** to a lot of compiler or linker issues to remind the developer about this. | ||
| ***/ | ||
| #if CA_NUM_DIMS == 251 | ||
| #define ca_has_no_pairs(vector) \ | ||
| ({ \ | ||
| __typeof__ (vector) _v = (vector); \ | ||
| _v[0] == -172 && _v[1] == 11 && _v[2] == -78; \ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the value of CA_NUM_DIMS changes, this function will not work properly. It could also use a comment to explain what the 3 numbers are meant to represent (I assume a number of empty slots, the value produced by the pair made by start/end, and then the remaining empty slots)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated, added comments and used compiler directives to make sure that the macro won't be defined anymore if this assumption is not met. I think that would cause a bunch of compiler errors, which should remind anyone who changes |
||
| }) | ||
| #endif | ||
|
|
||
| /** Comparison functions (see ca_most_similar(), ca_sliding_search(), and ca_complete_search()). **/ | ||
| double ca_cos_compare(void* v1, void* v2); | ||
| double ca_lev_compare(void* str1, void* str2); | ||
| bool ca_eql(pVector v1, pVector v2); | ||
|
|
||
| /*** Similarity search functions. | ||
| *** | ||
| *** Each function takes a `similarity` callback that receives two | ||
| *** `void*` arguments and returns a double, which is the shape of | ||
| *** ca_cos_compare() and ca_lev_compare(). | ||
| *** | ||
| *** NOTE(review): the `const` on the callback's return type has no | ||
| *** effect (qualifiers on a function's return type are ignored), and the | ||
| *** comparison functions are declared as returning plain `double`. GCC | ||
| *** can report this under -Wignored-qualifiers. Consider dropping it | ||
| *** here and in the matching definitions at the same time. | ||
| ***/ | ||
| void* ca_most_similar( | ||
| void* target, | ||
| void** data, | ||
| const unsigned int num_data, | ||
| const double (*similarity)(void*, void*), | ||
| const double threshold); | ||
| pXArray ca_sliding_search( | ||
| void** data, | ||
| const unsigned int num_data, | ||
| const unsigned int window_size, | ||
| const double (*similarity)(void*, void*), | ||
| const double threshold, | ||
| pXArray maybe_pairs); | ||
| pXArray ca_complete_search( | ||
| void** data, | ||
| const unsigned int num_data, | ||
| const double (*similarity)(void*, void*), | ||
| const double threshold, | ||
| pXArray maybe_pairs); | ||
|
|
||
| /** Module management functions. **/ | ||
| void ca_init(void); | ||
|
|
||
| #endif /* End of .h file. */ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| #ifndef GLYPH_H | ||
| #define GLYPH_H | ||
|
|
||
| /************************************************************************/ | ||
| /* Centrallix Application Server System */ | ||
| /* Centrallix Core */ | ||
| /* */ | ||
| /* Copyright (C) 1998-2026 LightSys Technology Services, Inc. */ | ||
| /* */ | ||
| /* This program is free software; you can redistribute it and/or modify */ | ||
| /* it under the terms of the GNU General Public License as published by */ | ||
| /* the Free Software Foundation; either version 2 of the License, or */ | ||
| /* (at your option) any later version. */ | ||
| /* */ | ||
| /* This program is distributed in the hope that it will be useful, */ | ||
| /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| /* GNU General Public License for more details. */ | ||
| /* */ | ||
| /* You should have received a copy of the GNU General Public License */ | ||
| /* along with this program; if not, write to the Free Software */ | ||
| /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ | ||
| /* 02111-1307 USA */ | ||
| /* */ | ||
| /* A copy of the GNU General Public License has been included in this */ | ||
| /* distribution in the file "COPYING". */ | ||
| /* */ | ||
| /* Module: glyph.h */ | ||
| /* Author: Israel Fuller */ | ||
| /* Creation: October 27, 2025 */ | ||
| /* Description: A simple debug visualizer to make pretty patterns in */ | ||
| /* the developer's terminal, which can be surprisingly */ | ||
| /* useful for debugging algorithms. */ | ||
| /************************************************************************/ | ||
|
|
||
| #include <stdbool.h> | ||
| #include <stdio.h> | ||
| #include <stdlib.h> | ||
|
|
||
| /** Uncomment the line below to activate glyphs. **/ | ||
| /** Glyphs should not be enabled in production code on the master branch. **/ | ||
| // #define ENABLE_GLYPHS | ||
|
|
||
| #ifdef ENABLE_GLYPHS | ||
| /*** Print a string to stdout when glyphs are enabled. | ||
| *** | ||
| *** Expands to a single expression with no trailing semicolon, so that | ||
| *** `glyph_print(s);` is exactly one statement and stays safe inside an | ||
| *** unbraced if/else. The argument is parenthesized and evaluated once. | ||
| *** | ||
| *** @param s The string to print. | ||
| ***/ | ||
| #define glyph_print(s) printf("%s", (s)) | ||
|
|
||
| /*** Initialize a simple debug visualizer to make pretty patterns in the | ||
| *** developer's terminal. Great for when you need to run a long task and | ||
| *** want a super simple way to make sure it's still working. | ||
| *** | ||
| *** @attention - Relies on storing data in variables in scope, so calling | ||
| *** glyph() requires a call to glyph_init() previously in the same scope. | ||
| *** | ||
| *** @param name The symbol name of the visualizer. | ||
| *** @param str The string printed for the visualization. | ||
| *** @param interval The number of invocations of glyph() required to print. | ||
| *** @param flush Whether to flush on output. | ||
| ***/ | ||
| #define glyph_init(name, str, interval, flush) \ | ||
| const char* vis_##name##_str = str; \ | ||
| const unsigned int vis_##name##_interval = interval; \ | ||
| const bool vis_##name##_flush = flush; \ | ||
| unsigned int vis_##name##_i = 0u; | ||
|
|
||
| /*** Invoke a visualizer. | ||
| *** | ||
| *** Increments the visualizer's counter and, on every `interval`-th | ||
| *** invocation, prints its string (flushing stdout if requested in | ||
| *** glyph_init()). | ||
| *** | ||
| *** Wrapped in do { } while (0) so the expansion is a single statement: | ||
| *** write `glyph(name);` with a trailing semicolon. This keeps the macro | ||
| *** safe inside an unbraced if/else. An interval of 0 never prints, which | ||
| *** avoids a modulo by zero. | ||
| *** | ||
| *** @param name The name of the visualizer to invoke. | ||
| ***/ | ||
| #define glyph(name) \ | ||
| do \ | ||
| { \ | ||
| if (vis_##name##_interval != 0u && ++vis_##name##_i % vis_##name##_interval == 0u) \ | ||
| { \ | ||
| glyph_print(vis_##name##_str); \ | ||
| if (vis_##name##_flush) fflush(stdout); \ | ||
| } \ | ||
| } \ | ||
| while (0) | ||
| #else | ||
| #define glyph_print(str) | ||
| #define glyph_init(name, str, interval, flush) | ||
| #define glyph(name) | ||
| #endif | ||
|
|
||
| #endif /* End of .h file. */ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The centrallix-os/datasets folder does not exist in the repo, and it's not used anywhere by anything in centrallix that I can tell. I would recommend either removing it from the .gitignore if it's only used by you, or, if it's needed by something (perhaps a Kardia branch?), adding a simple readme so the folder exists in the repo.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added
README.md.