Hacker News

That was my plan, but I haven't yet--the fix was in my renamed code and I haven't put in the work to make it correspond to the original code. But here's the commit message, maybe you can see it easily:

    Fix: Use correct stack index when adjusting cursors in mdb_cursor_del0
    
    In `mdb_cursor_del0`, the second cursor adjustment loop, which runs after
    `mdb_rebalance`, contained a latent bug that could lead to memory corruption or
    crashes.
    
    The Problem: The loop iterates through all active cursors to update their
    positions after a deletion. The logic correctly checks if another cursor
    (`cursor_to_update`) points to the same page as the deleting cursor (`cursor`)
    at the same stack level: `if
    (cursor_to_update->mc_page_stack[cursor->mc_stack_top_idx] == page_ptr)`
    
    However, inside this block, when retrieving the `MDB_node` pointer to update a
    sub-cursor, the code incorrectly used the other cursor's own stack top as the
    index:
    `PAGE_GET_NODE_PTR(cursor_to_update->mc_page_stack[cursor_to_update->mc_stack_to
    p_idx], ...)`
    
    If `cursor_to_update` had a deeper stack than `cursor` (e.g., it was a cursor
    on a sub-database), `cursor_to_update->mc_stack_top_idx` would be greater than
    `cursor->mc_stack_top_idx`. This caused the code to access a page pointer from
    a completely different (and deeper) level of `cursor_to_update`'s stack than
    the level that was just validated in the parent `if` condition. Accessing this
    out-of-context page pointer could lead to memory corruption, segmentation
    faults, or other unpredictable behavior.
    
    The Solution: This commit corrects the inconsistency by using the deleting
    cursor's stack index (`cursor->mc_stack_top_idx`) for all accesses to
    `cursor_to_update`'s stacks within this logic block. This ensures that the node
    pointer is retrieved from the same B-tree level that the surrounding code is
    operating on, resolving the data corruption risk and making the logic
    internally consistent.

And here's the function with renames (and the fix):

    struct MDB_cursor {
     /** Next cursor on this DB in this txn */
     MDB_cursor *mc_next_cursor_ptr;
     /** Backup of the original cursor if this cursor is a shadow */
     MDB_cursor *mc_backup_ptr;
     /** Context used for databases with #MDB_DUPSORT, otherwise NULL */
     struct MDB_xcursor *mc_sub_cursor_ctx_ptr;
     /** The transaction that owns this cursor */
     MDB_txn  *mc_txn_ptr;
     /** The database handle this cursor operates on */
     MDB_dbi  mc_dbi;
     /** The database record for this cursor */
     MDB_db  *mc_db_ptr;
     /** The database auxiliary record for this cursor */
     MDB_dbx  *mc_dbx_ptr;
     /** The @ref mt_dbflag for this database */
     unsigned char *mc_dbi_flags_ptr;
     unsigned short  mc_stack_depth; /**< number of pushed pages */
     unsigned short mc_stack_top_idx;  /**< index of top page, normally mc_stack_depth-1 */
    /** @defgroup mdb_cursor Cursor Flags
     * @ingroup internal
     * Cursor state flags.
     * @{
     */
    #define CURSOR_IS_INITIALIZED 0x01 /**< cursor has been initialized and is valid */
    #define CURSOR_AT_EOF 0x02   /**< No more data */
    #define CURSOR_IS_SUB_CURSOR 0x04   /**< Cursor is a sub-cursor */
    #define CURSOR_JUST_DELETED 0x08   /**< last op was a cursor_del */
    #define CURSOR_IS_IN_WRITE_TXN_TRACKING_LIST 0x40  /**< Un-track cursor when closing */
    #define CURSOR_IN_WRITE_MAP_TXN TXN_WRITE_MAP /**< Copy of txn flag */
    /** Read-only cursor into the txn's original snapshot in the map.
     * Set for read-only txns, and in #mdb_page_alloc() for #FREE_DBI when
     * #MDB_DEVEL & 2. Only implements code which is necessary for this.
     */
    #define CURSOR_IS_READ_ONLY_SNAPSHOT TXN_READ_ONLY
    /** @} */
     unsigned int mc_flags; /**< @ref mdb_cursor */
     MDB_page *mc_page_stack[CURSOR_STACK]; /**< stack of pushed pages */
     indx_t  mc_index_stack[CURSOR_STACK]; /**< stack of page indices */
    #ifdef MDB_VL32
     MDB_page *mc_vl32_overflow_page_ptr;  /**< a referenced overflow page */
    # define CURSOR_OVERFLOW_PAGE_PTR(cursor)   ((cursor)->mc_vl32_overflow_page_ptr)
    # define CURSOR_SET_OVERFLOW_PAGE_PTR(cursor, page_ptr) ((cursor)->mc_vl32_overflow_page_ptr = (page_ptr))
    #else
    # define CURSOR_OVERFLOW_PAGE_PTR(cursor)   ((MDB_page *)0)
    # define CURSOR_SET_OVERFLOW_PAGE_PTR(cursor, page_ptr) ((void)0)
    #endif
    };


    /** @brief Complete a delete operation by removing a node and rebalancing.
     *
     * This function is called after the preliminary checks in _mdb_cursor_del().
     * It performs the physical node deletion, decrements the entry count, adjusts
     * all other cursors affected by the deletion, and then calls mdb_rebalance()
     * to ensure B-tree invariants are maintained.
     *
     * @param[in,out] cursor The cursor positioned at the item to delete.
     * @return 0 on success, or a non-zero error code on failure.
     */
    static int
    mdb_cursor_del0(MDB_cursor *cursor)
    {
     int rc;
     MDB_page *page_ptr;
     indx_t node_idx_to_delete;
     unsigned int num_keys_after_delete;
     MDB_cursor *cursor_iter, *cursor_to_update;
     MDB_dbi dbi = cursor->mc_dbi;
    
     node_idx_to_delete = cursor->mc_index_stack[cursor->mc_stack_top_idx];
     page_ptr = cursor->mc_page_stack[cursor->mc_stack_top_idx];
     
     // 1. Physically delete the node from the page.
     mdb_node_del(cursor, cursor->mc_db_ptr->md_leaf2_key_size);
     cursor->mc_db_ptr->md_entry_count--;
    
     // 2. Adjust other cursors pointing to the same page.
     for (cursor_iter = cursor->mc_txn_ptr->mt_cursors_array_ptr[dbi]; cursor_iter; cursor_iter = cursor_iter->mc_next_cursor_ptr) {
      cursor_to_update = (cursor->mc_flags & CURSOR_IS_SUB_CURSOR) ? &cursor_iter->mc_sub_cursor_ctx_ptr->mx_cursor : cursor_iter;
      if (!(cursor_iter->mc_flags & cursor_to_update->mc_flags & CURSOR_IS_INITIALIZED)) continue;
      if (cursor_to_update == cursor || cursor_to_update->mc_stack_depth < cursor->mc_stack_depth) continue;
    
      if (cursor_to_update->mc_page_stack[cursor->mc_stack_top_idx] == page_ptr) {
       if (cursor_to_update->mc_index_stack[cursor->mc_stack_top_idx] == node_idx_to_delete) {
        // This cursor pointed to the exact node we deleted.
        cursor_to_update->mc_flags |= CURSOR_JUST_DELETED;
        if (cursor->mc_db_ptr->md_flags & MDB_DUPSORT) {
         cursor_to_update->mc_sub_cursor_ctx_ptr->mx_cursor.mc_flags &= ~(CURSOR_IS_INITIALIZED | CURSOR_AT_EOF);
        }
        continue;
       } else if (cursor_to_update->mc_index_stack[cursor->mc_stack_top_idx] > node_idx_to_delete) {
        // This cursor pointed after the deleted node; shift its index down.
        cursor_to_update->mc_index_stack[cursor->mc_stack_top_idx]--;
       }
       XCURSOR_REFRESH(cursor_to_update, cursor->mc_stack_top_idx, page_ptr);
      }
     }
    
     // 3. Rebalance the tree, which may merge or borrow from sibling pages.
     rc = mdb_rebalance(cursor);
     if (rc) goto fail;
    
     if (!cursor->mc_stack_depth) { // Tree is now empty.
      cursor->mc_flags |= CURSOR_AT_EOF;
      return rc;
     }
    
     // 4. Perform a second cursor adjustment pass. This is needed because rebalancing
     //    (specifically page merges) can further change cursor positions.
     page_ptr = cursor->mc_page_stack[cursor->mc_stack_top_idx];
     num_keys_after_delete = NUMKEYS(page_ptr);
    
     for (cursor_iter = cursor->mc_txn_ptr->mt_cursors_array_ptr[dbi]; !rc && cursor_iter; cursor_iter = cursor_iter->mc_next_cursor_ptr) {
      cursor_to_update = (cursor->mc_flags & CURSOR_IS_SUB_CURSOR) ? &cursor_iter->mc_sub_cursor_ctx_ptr->mx_cursor : cursor_iter;
      if (!(cursor_iter->mc_flags & cursor_to_update->mc_flags & CURSOR_IS_INITIALIZED)) continue;
      if (cursor_to_update->mc_stack_depth < cursor->mc_stack_depth) continue;
      
      if (cursor_to_update->mc_page_stack[cursor->mc_stack_top_idx] == page_ptr) {
       if (cursor_to_update->mc_index_stack[cursor->mc_stack_top_idx] >= cursor->mc_index_stack[cursor->mc_stack_top_idx]) {
        // If cursor is now positioned past the end of the page, move it to the next sibling.
        if (cursor_to_update->mc_index_stack[cursor->mc_stack_top_idx] >= num_keys_after_delete) {
         rc = mdb_cursor_sibling(cursor_to_update, 1);
         if (rc == MDB_NOTFOUND) {
          cursor_to_update->mc_flags |= CURSOR_AT_EOF;
          rc = MDB_SUCCESS;
          continue;
         }
         if (rc) goto fail;
        }
        if (cursor_to_update->mc_sub_cursor_ctx_ptr && !(cursor_to_update->mc_flags & CURSOR_AT_EOF)) {
                        // BUG FIX: Use the main cursor's stack index to access the other cursor's stacks.
                        // This ensures we are retrieving the node from the same B-tree level
                        // that the parent `if` condition already checked. The previous code used
                        // `cursor_to_update->mc_stack_top_idx`, which could be incorrect if its
                        // stack was deeper than the main cursor's.
         MDB_node *node_ptr = PAGE_GET_NODE_PTR(cursor_to_update->mc_page_stack[cursor->mc_stack_top_idx], cursor_to_update->mc_index_stack[cursor->mc_stack_top_idx]);
         if (node_ptr->mn_flags & NODE_DUPLICATE_DATA) {
          if (cursor_to_update->mc_sub_cursor_ctx_ptr->mx_cursor.mc_flags & CURSOR_IS_INITIALIZED) {
           if (!(node_ptr->mn_flags & NODE_SUB_DATABASE))
            cursor_to_update->mc_sub_cursor_ctx_ptr->mx_cursor.mc_page_stack[0] = NODE_GET_DATA_PTR(node_ptr);
          } else {
           mdb_xcursor_init1(cursor_to_update, node_ptr);
           rc = mdb_cursor_first(&cursor_to_update->mc_sub_cursor_ctx_ptr->mx_cursor, NULL, NULL);
           if (rc) goto fail;
          }
         }
         cursor_to_update->mc_sub_cursor_ctx_ptr->mx_cursor.mc_flags |= CURSOR_JUST_DELETED;
        }
       }
      }
     }
     
     cursor->mc_flags |= CURSOR_JUST_DELETED;
    
    fail:
     if (rc)
      cursor->mc_txn_ptr->mt_flags |= TXN_HAS_ERROR;
     return rc;
    }

I have confirmed the fixed logic seems correct, but I haven't written a test for it (I moved on to another project immediately after and haven't returned back to this one). That said…I'm almost certain I have run into this bug in production on a large (1TB) database that used DUPSORT heavily. It's kinda hard to trigger.

Also, thanks for a great library! LMDB is fantastic.

dagmx 2 days ago [ - ]

While your lldb use is interesting,, I’m very much interested in what you said you’re doing greenfield. What is the scope of that?

Clojure has enough code out there that it’s well covered by the major LLMs.

hyc_symas 2 days ago [ - ]

What's the bug ID, have you submitted the fix back?

erichocean 2 days ago [ - ]

hyc_symas a day ago [ - ]

Thanks for that. It doesn't look like there is an open bug report for this yet.

I understand your commit description but I'm still a bit puzzled by it. cursor_to_update shouldn't have a deeper stack than cursor, even if it was a cursor on a subdatabase. All the references to the subdatabase are in the subcursor, and that has no bearing on the main cursor's stack.

The original code with the bug was added for ITS#8406, commit 37081325f7356587c5e6ce4c1f36c3b303fa718c on 2016-04-18. Definitely a rare occurrence since it's uncommon to write from multiple cursors on the same DB. Fixed now in ITS#10396.

erichocean an hour ago [ - ]

> Fixed now in ITS#10396.

Thanks, appreciated.