Skip to content

foldcomp.open - return empty structure string if structure is not found in database #54

@valentynbez

Description

@valentynbez

static PyObject *foldcomp_open(PyObject* /* self */, PyObject* args, PyObject* kwargs) {
PyObject* path;
PyObject* user_ids = NULL;
PyObject* decompress = NULL;
PyObject* err_on_missing = NULL; // Raise an error if the file is missing. Default: False
static const char *kwlist[] = {"path", "ids", "decompress", "err_on_missing", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&|$OOO", const_cast<char**>(kwlist), PyUnicode_FSConverter, &path, &user_ids, &decompress, &err_on_missing)) {
return NULL;
}
if (path == NULL) {
PyErr_SetString(PyExc_TypeError, "path must be a path-like object");
return NULL;
}
const char* pathCStr = PyBytes_AS_STRING(path);
if (pathCStr == NULL) {
Py_DECREF(path);
PyErr_SetString(PyExc_TypeError, "path must be a path-like object");
return NULL;
}
if (user_ids != NULL && !PyList_Check(user_ids)) {
Py_DECREF(path);
PyErr_SetString(PyExc_TypeError, "user_ids must be a list.");
return NULL;
}
if (decompress != NULL && !PyBool_Check(decompress)) {
Py_DECREF(path);
PyErr_SetString(PyExc_TypeError, "decompress must be a boolean");
return NULL;
}
if (err_on_missing != NULL && !PyBool_Check(err_on_missing)) {
Py_DECREF(path);
PyErr_SetString(PyExc_TypeError, "err_on_missing must be a boolean");
return NULL;
}
std::string dbname(pathCStr);
std::string index = dbname + ".index";
bool err_on_missing_flag = false;
Py_DECREF(path);
FoldcompDatabaseObject *obj = PyObject_New(FoldcompDatabaseObject, &FoldcompDatabaseType);
if (obj == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate memory for FoldcompDatabaseObject");
return NULL;
}
int mode = DB_READER_USE_DATA;
if (user_ids != NULL && PySequence_Length(user_ids) > 0) {
mode |= DB_READER_USE_LOOKUP;
}
if (decompress == NULL) {
obj->decompress = true;
} else {
obj->decompress = PyObject_IsTrue(decompress);
}
if (err_on_missing == NULL) {
err_on_missing_flag = false;
} else {
err_on_missing_flag = PyObject_IsTrue(err_on_missing);
}
obj->memory_handle = make_reader(dbname.c_str(), index.c_str(), mode);
obj->user_indices = NULL;
if (user_ids != NULL && PySequence_Length(user_ids) > 0) {
size_t id_count = (size_t)PySequence_Length(user_ids);
// Reserve memory for the user indices
obj->user_indices = new std::vector<int64_t>();
obj->user_indices->reserve(id_count);
// user_indices.reserve(id_count);
for (Py_ssize_t i = 0; i < (Py_ssize_t)id_count; i++) {
// Iterate over all entries in the database and store ids in a vector of int64_t
PyObject* item = PySequence_GetItem(user_ids, i);
const char* data = PyUnicode_AsUTF8(item);
Py_DECREF(item);
uint32_t key = reader_lookup_entry(obj->memory_handle, data);
int64_t id = reader_get_id(obj->memory_handle, key);
if (id == -1 || key == UINT32_MAX) {
// Not found --> no error just
std::string err_msg = "Skipping entry ";
err_msg += data;
err_msg += " which is not in the database.";
if (err_on_missing_flag) {
Py_DECREF(obj);
PyErr_SetString(PyExc_KeyError, err_msg.c_str());
return NULL;
} else {
std::cerr << err_msg << std::endl;
continue;
}
}
obj->user_indices->push_back(id);
}
}
return (PyObject*)obj;

Current behaviour

Currently, there is no way to programmatically capture missing ids via Python interface. The missing ids are printed into non-capturable stderr .This would be a useful feature, as it will allow to troubleshoot problems.
As error might indeed not be the best solution, I think it will be great if missing id's were still returned, but with an empty structure string.

ids = ["valid", "invalid"]
with foldcomp.open(database_path, ids=ids) as db:
    print("Here!") 
    for idx, pdb in db:
        print(idx, pdb)
Skipping entry invalid which is not in the database.
Here! 
valid {pdb_string}

Proposed behaviour

ids = ["invalid"]
with foldcomp.open(database_path, ids=ids) as db:
    print("Here!")
    for idx, pdb in db:
        print(idx, pdb)
Here!
valid {pdb_string}
invalid 

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions