Skip to content

Commit

Permalink
use art to optimize pending list construction
Browse files Browse the repository at this point in the history
Summary: In the recursive delete case we will receive notifications
working up to the root of a tree from its leaves.  Our existing pending
list has very limited aggregation of events and will schedule a stat
call for each deleted node.

It is more efficient if we can lop off a portion of the tree when
we add a recursive element at a higher location in the tree.

That's what this diff does.  It makes an invidual add operation a bit
more expensive than it was previously, but the cost is tiny compared
to the savings we get in terms of filesystem activity.

The included benchmark demonstrates this with a synthetic tree
containing around 260k nodes; these are added to a pending list first
from the root to the leaves, and secondly from the leaves to the root.
As part of this, we also generate a call to `w_lstat` to approximate
the activity of our `stat_path` function.

Prior to switching on the ART assisted index, both directions of insert
would occupy approx. 2 seconds of wall time on my macbook.  This
represents the best case hot cache time for the `w_lstat` calls because
the benchmark repeatedly examines the same file.

With this diff we can prune the list of 260k nodes down to 12 nodes, and
we will discover the pruned nodes as part of the recursive tree walk
that will follow this in a real world situation.

Test Plan: `make integration` (enabled -fsanitize=address to prove that
our memory access is good, too).

The benchmark results with this diff (previously, these would take 2 seconds each):

```
 # took 0.011s to insert 12 items into pending coll
 # took 0.251s to reverse insert 12 items into pending coll
```

I also tried a pretty aggressive test where I would alternate between
running:

```
hg sparse -X fooboid
hg st
watchman-diag | arc paste
hg sparse --reset
```

(where fooboid is a large subdirectory of a large repo)
  • Loading branch information
wez committed Jun 3, 2016
1 parent 0e29f40 commit 468042d
Show file tree
Hide file tree
Showing 9 changed files with 393 additions and 28 deletions.
17 changes: 17 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ TESTS = \
tests/argv.t \
tests/bser.t \
tests/ignore.t \
tests/pending.t \
tests/log.t \
tests/wildmatch.t
noinst_PROGRAMS = $(TESTS)
Expand Down Expand Up @@ -219,6 +220,22 @@ tests_ignore_t_SOURCES = \
string.c \
log.c

tests_pending_t_CPPFLAGS = $(THIRDPARTY_CPPFLAGS)
tests_pending_t_LDADD = $(ART_LIB) $(TAP_LIB) $(JSON_LIB)
tests_pending_t_SOURCES = \
tests/pending_test.c \
watcher/helper.c \
hash.c \
ht.c \
ignore.c \
pending.c \
expflags.c \
opendir.c \
cfg.c \
time.c \
string.c \
log.c

tests_log_t_CPPFLAGS = $(THIRDPARTY_CPPFLAGS)
tests_log_t_LDADD = $(JSON_LIB) $(TAP_LIB)
tests_log_t_SOURCES = \
Expand Down
1 change: 1 addition & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ AC_CHECK_HEADERS(CoreServices/CoreServices.h, [
])
AC_CHECK_FUNCS(backtrace backtrace_symbols backtrace_symbols_fd)
AC_CHECK_FUNCS(sys_siglist)
AC_CHECK_FUNCS(memmem)

if test -n "$ac_cv_header_sys_statvfs_h"; then
AC_CHECK_MEMBERS([struct statvfs.f_fstypename,struct statvfs.f_basetype],
Expand Down
212 changes: 189 additions & 23 deletions pending.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,54 @@ static const struct flag_map kflags[] = {
{0, NULL},
};

// Since the tree has no internal knowledge about path structures, when we
// search for "foo/bar" it may return a prefix match for an existing node
// with the key "foo/bard". We use this function to test whether the string
// exactly matches the input ("foo/bar") or whether it has a slash as the next
// character after the common prefix ("foo/bar/" as a prefix).
static bool is_path_prefix(const char *path, size_t path_len,
size_t common_prefix) {
if (common_prefix == path_len) {
return true;
}

if (common_prefix > path_len) {
return false;
}

return path[common_prefix] == WATCHMAN_DIR_SEP
#ifdef _WIN32
// Windows allows both kinds of slashes
|| path[common_prefix] == '/'
#endif
;
}

// Helper to un-doubly-link a pending item.
static inline void unlink_item(struct watchman_pending_collection *coll,
struct watchman_pending_fs *p) {
if (coll->pending == p) {
coll->pending = p->next;
}
if (p->prev) {
p->prev->next = p->next;
}
if (p->next) {
p->next->prev = p->prev;
}
}

// Helper to doubly-link a pending item to the head of a collection.
static inline void link_head(struct watchman_pending_collection *coll,
struct watchman_pending_fs *p) {
p->prev = NULL;
p->next = coll->pending;
if (coll->pending) {
coll->pending->prev = p;
}
coll->pending = p;
}

/* Free a pending_fs node */
void w_pending_fs_free(struct watchman_pending_fs *p) {
w_string_delref(p->path);
Expand All @@ -20,23 +68,20 @@ void w_pending_fs_free(struct watchman_pending_fs *p) {
bool w_pending_coll_init(struct watchman_pending_collection *coll) {
coll->pending = NULL;
coll->pinged = false;
coll->pending_uniq = w_ht_new(WATCHMAN_BATCH_LIMIT, &w_ht_string_funcs);
if (!coll->pending_uniq) {
return false;
}
if (pthread_mutex_init(&coll->lock, NULL)) {
return false;
}
if (pthread_cond_init(&coll->cond, NULL)) {
return false;
}
art_tree_init(&coll->tree);
return true;
}

/* destroy a pending_coll */
void w_pending_coll_destroy(struct watchman_pending_collection *coll) {
w_pending_coll_drain(coll);
w_ht_free(coll->pending_uniq);
art_tree_destroy(&coll->tree);
pthread_mutex_destroy(&coll->lock);
pthread_cond_destroy(&coll->cond);
}
Expand All @@ -49,7 +94,8 @@ void w_pending_coll_drain(struct watchman_pending_collection *coll) {
w_pending_fs_free(p);
}

w_ht_free_entries(coll->pending_uniq);
art_tree_destroy(&coll->tree);
art_tree_init(&coll->tree);
}

/* compute a deadline on entry, then obtain the collection lock
Expand Down Expand Up @@ -98,14 +144,122 @@ void w_pending_coll_unlock(struct watchman_pending_collection *coll) {
}
}

static inline void consolidate_item(struct watchman_pending_fs *p,
int flags) {
// Deletion is a bit awkward in this radix tree implementation.
// We can't recursively delete a given prefix as a built-in operation
// and it is non-trivial to add that functionality right now.
// When we lop-off a portion of a tree that we're going to analyze
// recursively, we have to iterate each leaf and explicitly delete
// that leaf.
// Since deletion invalidates the iteration state we have to signal
// to stop iteration after each deletion and then retry the prefix
// deletion.
// We use this kid_context state to pass down the required information
// to the iterator callback so that we adjust the overall state correctly.
struct kid_context {
w_string_t *root;
struct watchman_pending_collection *coll;
};

// We need to guarantee that we never collapse a cookie notification
// out of the pending list, because we absolutely must observe it coming
// in via the kernel notification mechanism in order for synchronization
// to be correct.
// Since we don't have a w_root_t available, we can't tell what the
// precise cookie prefix is for the current pending list here, so
// we do a substring match. Not the most elegant thing in the world.
static inline bool is_possibly_a_cookie(w_string_t *path) {
return w_string_contains_cstr_len(path, WATCHMAN_COOKIE_PREFIX,
sizeof(WATCHMAN_COOKIE_PREFIX) - 1);
}

// This is the iterator callback we use to prune out obsoleted leaves.
// We need to compare the prefix to make sure that we don't delete
// a sibling node by mistake (see commentary on the is_path_prefix
// function for more on that).
static int delete_kids(void *data, const unsigned char *key, uint32_t key_len,
void *value) {
struct kid_context *ctx = data;
struct watchman_pending_fs *p = value;
unused_parameter(value);

if ((p->flags & W_PENDING_CRAWL_ONLY) == 0 && key_len > ctx->root->len &&
is_path_prefix((const char *)key, key_len, ctx->root->len) &&
!is_possibly_a_cookie(p->path)) {
// Unlink the child from the pending index.
unlink_item(ctx->coll, p);
// and completely free it.

w_pending_fs_free(p);

// Remove it from the art tree also.
art_delete(&ctx->coll->tree, key, key_len);

// Stop iteration because we just invalidated the iterator state
// by modifying the tree mid-iteration.
return 1;
}

return 0;
}

// if there are any entries that are obsoleted by a recursive insert,
// walk over them now and mark them as ignored.
static void
maybe_prune_obsoleted_children(struct watchman_pending_collection *coll,
w_string_t *path, int flags) {
if ((flags & (W_PENDING_RECURSIVE | W_PENDING_CRAWL_ONLY)) ==
W_PENDING_RECURSIVE) {
struct kid_context ctx = {path, coll};
// Since deletion invalidates the iterator, we need to repeatedly
// call this to prune out the nodes. It will return 0 once no
// matching prefixes are found and deleted.
while (art_iter_prefix(&coll->tree, (const uint8_t *)path->buf, path->len,
delete_kids, &ctx)) {
// OK; try again
;
}
}
}

static inline void consolidate_item(struct watchman_pending_collection *coll,
struct watchman_pending_fs *p, int flags) {
// Increase the strength of the pending item if either of these
// flags are set.
// We upgrade crawl-only as well as recursive; it indicates that
// we've recently just performed the stat and we want to avoid
// infinitely trying to stat-and-crawl
p->flags |= flags & (W_PENDING_CRAWL_ONLY|W_PENDING_RECURSIVE);

maybe_prune_obsoleted_children(coll, p->path, p->flags);
}

// Check the tree to see if there is a path that is earlier/higher in the
// filesystem than the input path; if there is, and it is recursive,
// return true to indicate that there is no need to track this new path
// due to the already scheduled higher level path.
static bool
is_obsoleted_by_containing_dir(struct watchman_pending_collection *coll,
w_string_t *path) {
struct watchman_pending_fs *p;
art_leaf *leaf =
art_longest_match(&coll->tree, (const uint8_t *)path->buf, path->len);
if (!leaf) {
return false;
}
p = leaf->value;

if (p->flags & W_PENDING_RECURSIVE &&
is_path_prefix(path->buf, path->len, leaf->key_len)) {

if (is_possibly_a_cookie(path)) {
return false;
}

// Yes: the pre-existing entry higher up in the tree obsoletes this
// one that we would add now.
return true;
}
return false;
}

/* add a pending entry. Will consolidate an existing entry with the
Expand All @@ -116,19 +270,26 @@ bool w_pending_coll_add(struct watchman_pending_collection *coll,
struct watchman_pending_fs *p;
char flags_label[128];

p = w_ht_val_ptr(w_ht_get(coll->pending_uniq, w_ht_ptr_val(path)));
p = art_search(&coll->tree, (const unsigned char*)path->buf, path->len);
if (p) {
/* Entry already exists: consolidate */
consolidate_item(p, flags);
consolidate_item(coll, p, flags);
/* all done */
return true;
}

if (is_obsoleted_by_containing_dir(coll, path)) {
return true;
}

// Try to allocate the new node before we prune any children.
p = calloc(1, sizeof(*p));
if (!p) {
return false;
}

maybe_prune_obsoleted_children(coll, path, flags);

w_expand_flags(kflags, flags, flags_label, sizeof(flags_label));
w_log(W_LOG_DBG, "add_pending: %.*s %s\n", path->len, path->buf, flags_label);

Expand All @@ -137,9 +298,8 @@ bool w_pending_coll_add(struct watchman_pending_collection *coll,
p->path = path;
w_string_addref(path);

p->next = coll->pending;
coll->pending = p;
w_ht_set(coll->pending_uniq, w_ht_ptr_val(path), w_ht_ptr_val(p));
link_head(coll, p);
art_insert(&coll->tree, (const uint8_t *)path->buf, path->len, p);

return true;
}
Expand Down Expand Up @@ -169,21 +329,28 @@ void w_pending_coll_append(struct watchman_pending_collection *target,
struct watchman_pending_fs *p, *target_p;

while ((p = w_pending_coll_pop(src)) != NULL) {
target_p = w_ht_val_ptr(w_ht_get(target->pending_uniq,
w_ht_ptr_val(p->path)));
target_p = art_search(&target->tree,
(const uint8_t *)p->path->buf, p->path->len);
if (target_p) {
/* Entry already exists: consolidate */
consolidate_item(target_p, p->flags);
consolidate_item(target, target_p, p->flags);
w_pending_fs_free(p);
continue;
}

p->next = target->pending;
target->pending = p;
w_ht_set(target->pending_uniq, w_ht_ptr_val(p->path), w_ht_ptr_val(p));
if (is_obsoleted_by_containing_dir(target, p->path)) {
w_pending_fs_free(p);
continue;
}
maybe_prune_obsoleted_children(target, p->path, p->flags);

link_head(target, p);
art_insert(&target->tree, (const uint8_t *)p->path->buf, p->path->len, p);
}

w_ht_free_entries(src->pending_uniq);
// Empty the src tree and reset it
art_tree_destroy(&src->tree);
art_tree_init(&src->tree);
src->pending = NULL;
}

Expand All @@ -196,16 +363,15 @@ struct watchman_pending_fs *w_pending_coll_pop(
struct watchman_pending_fs *p = coll->pending;

if (p) {
coll->pending = p->next;
p->next = NULL;
unlink_item(coll, p);
}

return p;
}

/* Returns the number of unique pending items in the collection */
uint32_t w_pending_coll_size(struct watchman_pending_collection *coll) {
return w_ht_size(coll->pending_uniq);
return (uint32_t)art_size(&coll->tree);
}

/* vim:ts=2:sw=2:et:
Expand Down
4 changes: 2 additions & 2 deletions root.c
Original file line number Diff line number Diff line change
Expand Up @@ -498,12 +498,12 @@ bool w_root_process_pending(w_root_t *root,
}

w_log(W_LOG_DBG, "processing %d events in %s\n",
w_ht_size(coll->pending_uniq), root->root_path->buf);
w_pending_coll_size(coll), root->root_path->buf);

// Steal the contents
pending = coll->pending;
coll->pending = NULL;
w_ht_free_entries(coll->pending_uniq);
w_pending_coll_drain(coll);

while (pending) {
p = pending;
Expand Down
26 changes: 26 additions & 0 deletions string.c
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,32 @@ bool w_string_startswith_caseless(w_string_t *str, w_string_t *prefix)
return true;
}

bool w_string_contains_cstr_len(w_string_t *str, const char *needle,
uint32_t nlen) {
#if HAVE_MEMMEM
return memmem(str->buf, str->len, needle, nlen) != NULL;
#else
// Most likely only for Windows.
// Inspired by http://stackoverflow.com/a/24000056/149111
const char *haystack = str->buf;
uint32_t hlen = str->len;
const char *limit;

if (nlen == 0 || hlen < nlen) {
return false;
}

limit = haystack + hlen - nlen + 1;
while ((haystack = memchr(haystack, needle[0], limit - haystack)) != NULL) {
if (memcmp(haystack, needle, nlen) == 0) {
return true;
}
haystack++;
}
return false;
#endif
}

w_string_t *w_string_canon_path(w_string_t *str)
{
int end;
Expand Down
Loading

0 comments on commit 468042d

Please sign in to comment.