Skip to content

Commit

Permalink
Replace Ruby’s ID interning with C constant pool
Browse files Browse the repository at this point in the history
  • Loading branch information
amomchilov committed Jan 10, 2025
1 parent 991bacb commit 16fd1e6
Show file tree
Hide file tree
Showing 5 changed files with 455 additions and 66 deletions.
24 changes: 16 additions & 8 deletions ext/rbs_extension/location.c
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,13 @@ static VALUE location_end_pos(VALUE self) {
return INT2FIX(loc->rg.end);
}

static rbs_constant_id_t rbs_find_constant_id_from_ruby_symbol(VALUE symbol) {
static rbs_constant_id_t rbs_constant_pool_insert_ruby_symbol(VALUE symbol) {
VALUE name = rb_sym2str(symbol);

return rbs_constant_pool_find(RBS_GLOBAL_CONSTANT_POOL, (const uint8_t *) RSTRING_PTR(name), RSTRING_LEN(name));
// Constants inserted here will never be freed, but that's acceptable because:
// 1. Most symbols passed into here will be the ones already inserted into the constant pool by `parser.c`.
// 2. Methods like `add_required_child` and `add_optional_child` will usually only get called with a few different symbols.
return rbs_constant_pool_insert_constant(RBS_GLOBAL_CONSTANT_POOL, (const uint8_t *) RSTRING_PTR(name), RSTRING_LEN(name));
}

static VALUE location_add_required_child(VALUE self, VALUE name, VALUE start, VALUE end) {
Expand All @@ -181,7 +184,7 @@ static VALUE location_add_required_child(VALUE self, VALUE name, VALUE start, VA
rg.start = rbs_loc_position(FIX2INT(start));
rg.end = rbs_loc_position(FIX2INT(end));

rbs_loc_add_required_child(loc, rbs_find_constant_id_from_ruby_symbol(name), rg);
rbs_loc_add_required_child(loc, rbs_constant_pool_insert_ruby_symbol(name), rg);

return Qnil;
}
Expand All @@ -193,15 +196,15 @@ static VALUE location_add_optional_child(VALUE self, VALUE name, VALUE start, VA
rg.start = rbs_loc_position(FIX2INT(start));
rg.end = rbs_loc_position(FIX2INT(end));

rbs_loc_add_optional_child(loc, rbs_find_constant_id_from_ruby_symbol(name), rg);
rbs_loc_add_optional_child(loc, rbs_constant_pool_insert_ruby_symbol(name), rg);

return Qnil;
}

static VALUE location_add_optional_no_child(VALUE self, VALUE name) {
rbs_loc *loc = rbs_check_location(self);

rbs_loc_add_optional_child(loc, rbs_find_constant_id_from_ruby_symbol(name), NULL_RANGE);
rbs_loc_add_optional_child(loc, rbs_constant_pool_insert_ruby_symbol(name), NULL_RANGE);

return Qnil;
}
Expand All @@ -224,10 +227,16 @@ static VALUE rbs_new_location_from_loc_range(VALUE buffer, rbs_loc_range rg) {
return obj;
}

/**
 * Looks up the id stored in the global constant pool for the name of a Ruby
 * Symbol. This only calls `rbs_constant_pool_find()`; nothing is inserted.
 *
 * @param symbol The Ruby Symbol whose name should be looked up.
 * @return The result of `rbs_constant_pool_find()` for the symbol's bytes.
 */
static rbs_constant_id_t rbs_constant_pool_find_ruby_symbol(VALUE symbol) {
  VALUE symbol_string = rb_sym2str(symbol);

  // Hand the Ruby String's raw bytes and length to the pool lookup.
  const uint8_t *bytes = (const uint8_t *) RSTRING_PTR(symbol_string);
  const long byte_count = RSTRING_LEN(symbol_string);

  return rbs_constant_pool_find(RBS_GLOBAL_CONSTANT_POOL, bytes, byte_count);
}

static VALUE location_aref(VALUE self, VALUE name) {
rbs_loc *loc = rbs_check_location(self);

rbs_constant_id_t id = rbs_find_constant_id_from_ruby_symbol(name);
rbs_constant_id_t id = rbs_constant_pool_find_ruby_symbol(name);

if (loc->children != NULL && id != RBS_CONSTANT_ID_UNSET) {
for (unsigned short i = 0; i < loc->children->len; i++) {
Expand All @@ -248,8 +257,7 @@ static VALUE location_aref(VALUE self, VALUE name) {
}

static VALUE rbs_constant_to_ruby_symbol(rbs_constant_t *constant) {
// Casts back the Ruby Symbol that was inserted by `rbs_constant_pool_insert_constant()`.
return (VALUE) constant;
return ID2SYM(rb_intern2((const char *) constant->start, constant->length));
}

static VALUE location_optional_keys(VALUE self) {
Expand Down
12 changes: 11 additions & 1 deletion ext/rbs_extension/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@ Init_rbs_extension(void)
rbs__init_location();
rbs__init_parser();

rbs_constant_pool_init(RBS_GLOBAL_CONSTANT_POOL, 0);
// Calculated based on the number of unique strings used with the `INTERN` macro in `parser.c`.
//
// ```bash
// grep -o 'INTERN("\([^"]*\)")' ext/rbs_extension/parser.c \
// | sed 's/INTERN("\(.*\)")/\1/' \
// | sort -u \
// | wc -l
// ```
const size_t num_uniquely_interned_strings = 26;
rbs_constant_pool_init(RBS_GLOBAL_CONSTANT_POOL, num_uniquely_interned_strings);

ruby_vm_at_exit(Deinit_rbs_extension);
}
20 changes: 19 additions & 1 deletion ext/rbs_extension/parserstate.c
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,25 @@ parserstate *alloc_parser(VALUE buffer, lexstate *lexer, int start_pos, int end_
.constant_pool = {},
};

rbs_constant_pool_init(&parser->constant_pool, 0);
// The parser's constant pool is mainly used for storing the names of type variables, which usually aren't many.
// Below are some statistics gathered from the current test suite. We can see that 56% of parsers never add to their
// constant pool at all. The initial capacity needs to be a power of 2. Picking 2 means that we won't need to realloc
// in 85% of cases.
//
// TODO: recalculate these statistics based on a real world codebase, rather than the test suite.
//
// | Size | Count | Cumulative | % Coverage |
// |------|-------|------------|------------|
// | 0 | 7,862 | 7,862 | 56% |
// | 1 | 3,196 | 11,058 | 79% |
// | 2 | 883 | 11,941 | 85% |
// | 3 | 778 | 12,719 | 91% |
// | 4 | 478 | 13,197 | 95% |
// | 5 | 316 | 13,513 | 97% |
// | 6 | 288 | 13,801 | 99% |
// | 7 | 144 | 13,945 | 100% |
const size_t initial_pool_capacity = 2;
rbs_constant_pool_init(&parser->constant_pool, initial_pool_capacity);

parser_advance(parser);
parser_advance(parser);
Expand Down
128 changes: 123 additions & 5 deletions include/rbs/util/rbs_constant_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,124 @@
/**
* A constant id is a unique identifier for a constant in the constant pool.
*/
typedef uintptr_t rbs_constant_id_t;
typedef uint32_t rbs_constant_id_t;

/**
* A growable list of constant IDs. Usually used to represent a set of locals.
*
* Set up with `rbs_constant_id_list_init()` or
* `rbs_constant_id_list_init_capacity()`, and released with
* `rbs_constant_id_list_free()`.
*/
typedef struct {
/** The number of constant ids currently stored in the list. */
size_t size;

/**
* The number of constant ids that have been allocated in the list
* (presumably the allocated length of `ids` -- confirm against the
* implementation).
*/
size_t capacity;

/** The constant ids in the list. */
rbs_constant_id_t *ids;
} rbs_constant_id_list_t;

/**
* Initialize a list of constant ids.
*
* @param list The list to initialize.
*/
void rbs_constant_id_list_init(rbs_constant_id_list_t *list);

/**
* Initialize a list of constant ids with a given capacity.
*
* @param list The list to initialize.
* @param capacity The initial capacity of the list.
*/
void rbs_constant_id_list_init_capacity(rbs_constant_id_list_t *list, size_t capacity);

/**
* Append a constant id to a list of constant ids. Returns false if any
* potential reallocations fail.
*
* @param list The list to append to.
* @param id The id to append.
* @return Whether the append succeeded.
*/
bool rbs_constant_id_list_append(rbs_constant_id_list_t *list, rbs_constant_id_t id);

/**
* Insert a constant id into a list of constant ids at the specified index.
*
* @param list The list to insert into.
* @param index The index at which to insert.
* @param id The id to insert.
*/
void rbs_constant_id_list_insert(rbs_constant_id_list_t *list, size_t index, rbs_constant_id_t id);

/**
* Checks if the current constant id list includes the given constant id.
*
* @param list The list to check.
* @param id The id to check for.
* @return Whether the list includes the given id.
*/
bool rbs_constant_id_list_includes(rbs_constant_id_list_t *list, rbs_constant_id_t id);

/**
* Free the memory associated with a list of constant ids.
*
* @param list The list to free.
*/
void rbs_constant_id_list_free(rbs_constant_id_list_t *list);

/**
* The type of bucket in the constant pool hash map. This determines how the
* bucket should be freed.
*/
typedef unsigned int rbs_constant_pool_bucket_type_t;

/** By default, each constant is a slice of the source. */
static const rbs_constant_pool_bucket_type_t RBS_CONSTANT_POOL_BUCKET_DEFAULT = 0;

/** An owned constant is one for which memory has been allocated. */
static const rbs_constant_pool_bucket_type_t RBS_CONSTANT_POOL_BUCKET_OWNED = 1;

/** A "constant" bucket refers to memory that is constant, i.e. known at compile time. */
static const rbs_constant_pool_bucket_type_t RBS_CONSTANT_POOL_BUCKET_CONSTANT = 2;

/**
* A bucket in the constant pool's hash map.
*
* `id` and `type` are declared as adjacent bit-fields of 30 and 2 bits.
*/
typedef struct {
/**
* The incremental ID used for indexing back into the pool. As a 30-bit
* unsigned bit-field it can hold values from 0 to 2^30 - 1.
*/
unsigned int id: 30;

/**
* The type of the bucket, which determines how to free it. Two bits are
* enough for the three `RBS_CONSTANT_POOL_BUCKET_*` values (0, 1 and 2).
*/
rbs_constant_pool_bucket_type_t type: 2;

/**
* The hash of the bucket (presumably the hash of the constant's bytes --
* confirm against the implementation).
*/
uint32_t hash;
} rbs_constant_pool_bucket_t;

/** A constant in the pool which effectively stores a string. */
typedef uintptr_t rbs_constant_t;
typedef struct {
/** A pointer to the start of the string. */
const uint8_t *start;

/** The length of the string. */
size_t length;
} rbs_constant_t;

/** The overall constant pool, which stores constants found while parsing. */
typedef struct {
void *dummy; // Workaround for structs not being allowed to be empty.
/** The buckets in the hash map. */
rbs_constant_pool_bucket_t *buckets;

/** The constants that are stored in the buckets. */
rbs_constant_t *constants;

/** The number of buckets in the hash map. */
uint32_t size;

/** The number of buckets that have been allocated in the hash map. */
uint32_t capacity;
} rbs_constant_pool_t;

// A temporary stand-in for the constant pool until we start using a real implementation.
// For now, it just defers to Ruby's ID interning mechanism (`rb_intern3`).
// A global constant pool for storing permanent keywords, such as the names of location children in `parser.c`.
extern rbs_constant_pool_t *RBS_GLOBAL_CONSTANT_POOL;

/**
Expand Down Expand Up @@ -80,6 +186,18 @@ rbs_constant_id_t rbs_constant_pool_find(const rbs_constant_pool_t *pool, const
*/
rbs_constant_id_t rbs_constant_pool_insert_shared(rbs_constant_pool_t *pool, const uint8_t *start, size_t length);

/**
* Insert a constant into a constant pool from memory that is now owned by the
* constant pool. Returns the id of the constant, or 0 if any potential calls to
* resize fail.
*
* @param pool The pool to insert the constant into.
* @param start A pointer to the start of the constant.
* @param length The length of the constant.
* @return The id of the constant.
*/
rbs_constant_id_t rbs_constant_pool_insert_owned(rbs_constant_pool_t *pool, uint8_t *start, size_t length);

/**
* Insert a constant into a constant pool from memory that is constant. Returns
* the id of the constant, or 0 if any potential calls to resize fail.
Expand Down
Loading

0 comments on commit 16fd1e6

Please sign in to comment.