|
static PyObject *foldcomp_open(PyObject* /* self */, PyObject* args, PyObject* kwargs) { |
|
PyObject* path; |
|
PyObject* user_ids = NULL; |
|
PyObject* decompress = NULL; |
|
PyObject* err_on_missing = NULL; // Raise an error if the file is missing. Default: False |
|
|
|
static const char *kwlist[] = {"path", "ids", "decompress", "err_on_missing", NULL}; |
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&|$OOO", const_cast<char**>(kwlist), PyUnicode_FSConverter, &path, &user_ids, &decompress, &err_on_missing)) { |
|
return NULL; |
|
} |
|
if (path == NULL) { |
|
PyErr_SetString(PyExc_TypeError, "path must be a path-like object"); |
|
return NULL; |
|
} |
|
const char* pathCStr = PyBytes_AS_STRING(path); |
|
if (pathCStr == NULL) { |
|
Py_DECREF(path); |
|
PyErr_SetString(PyExc_TypeError, "path must be a path-like object"); |
|
return NULL; |
|
} |
|
|
|
if (user_ids != NULL && !PyList_Check(user_ids)) { |
|
Py_DECREF(path); |
|
PyErr_SetString(PyExc_TypeError, "user_ids must be a list."); |
|
return NULL; |
|
} |
|
|
|
if (decompress != NULL && !PyBool_Check(decompress)) { |
|
Py_DECREF(path); |
|
PyErr_SetString(PyExc_TypeError, "decompress must be a boolean"); |
|
return NULL; |
|
} |
|
|
|
if (err_on_missing != NULL && !PyBool_Check(err_on_missing)) { |
|
Py_DECREF(path); |
|
PyErr_SetString(PyExc_TypeError, "err_on_missing must be a boolean"); |
|
return NULL; |
|
} |
|
|
|
std::string dbname(pathCStr); |
|
std::string index = dbname + ".index"; |
|
bool err_on_missing_flag = false; |
|
Py_DECREF(path); |
|
|
|
FoldcompDatabaseObject *obj = PyObject_New(FoldcompDatabaseObject, &FoldcompDatabaseType); |
|
if (obj == NULL) { |
|
PyErr_SetString(PyExc_MemoryError, "Could not allocate memory for FoldcompDatabaseObject"); |
|
return NULL; |
|
} |
|
|
|
int mode = DB_READER_USE_DATA; |
|
if (user_ids != NULL && PySequence_Length(user_ids) > 0) { |
|
mode |= DB_READER_USE_LOOKUP; |
|
} |
|
|
|
if (decompress == NULL) { |
|
obj->decompress = true; |
|
} else { |
|
obj->decompress = PyObject_IsTrue(decompress); |
|
} |
|
|
|
if (err_on_missing == NULL) { |
|
err_on_missing_flag = false; |
|
} else { |
|
err_on_missing_flag = PyObject_IsTrue(err_on_missing); |
|
} |
|
|
|
obj->memory_handle = make_reader(dbname.c_str(), index.c_str(), mode); |
|
|
|
obj->user_indices = NULL; |
|
if (user_ids != NULL && PySequence_Length(user_ids) > 0) { |
|
size_t id_count = (size_t)PySequence_Length(user_ids); |
|
// Reserve memory for the user indices |
|
obj->user_indices = new std::vector<int64_t>(); |
|
obj->user_indices->reserve(id_count); |
|
// user_indices.reserve(id_count); |
|
for (Py_ssize_t i = 0; i < (Py_ssize_t)id_count; i++) { |
|
// Iterate over all entries in the database and store ids in a vector of int64_t |
|
PyObject* item = PySequence_GetItem(user_ids, i); |
|
const char* data = PyUnicode_AsUTF8(item); |
|
Py_DECREF(item); |
|
uint32_t key = reader_lookup_entry(obj->memory_handle, data); |
|
int64_t id = reader_get_id(obj->memory_handle, key); |
|
if (id == -1 || key == UINT32_MAX) { |
|
// Not found --> no error just |
|
std::string err_msg = "Skipping entry "; |
|
err_msg += data; |
|
err_msg += " which is not in the database."; |
|
if (err_on_missing_flag) { |
|
Py_DECREF(obj); |
|
PyErr_SetString(PyExc_KeyError, err_msg.c_str()); |
|
return NULL; |
|
} else { |
|
std::cerr << err_msg << std::endl; |
|
continue; |
|
} |
|
} |
|
obj->user_indices->push_back(id); |
|
} |
|
} |
|
|
|
return (PyObject*)obj; |
foldcomp/foldcomp/foldcomp.cxx
Lines 333 to 434 in b97c193
Current behaviour
Currently, there is no way to programmatically capture missing ids via Python interface. The missing ids are printed into non-capturable
stderr.This would be a useful feature, as it will allow to troubleshoot problems.As error might indeed not be the best solution, I think it will be great if missing id's were still returned, but with an empty structure string.
Proposed behaviour