diff --git a/docs/posts/2020-06-07-correlation-parameter-mem/index.html b/docs/posts/2020-06-07-correlation-parameter-mem/index.html
index 235be5f..d1af096 100644
--- a/docs/posts/2020-06-07-correlation-parameter-mem/index.html
+++ b/docs/posts/2020-06-07-correlation-parameter-mem/index.html
@@ -2612,7 +2612,7 @@
${suggestion.title}
diff --git a/docs/posts/2020-06-25-indexing-tip-for-spacyr/index.html b/docs/posts/2020-06-25-indexing-tip-for-spacyr/index.html
index ef2bcf8..523774d 100644
--- a/docs/posts/2020-06-25-indexing-tip-for-spacyr/index.html
+++ b/docs/posts/2020-06-25-indexing-tip-for-spacyr/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-06-30-treemap-with-ggplot/index.html b/docs/posts/2020-06-30-treemap-with-ggplot/index.html
index 8dc3d11..10c6344 100644
--- a/docs/posts/2020-06-30-treemap-with-ggplot/index.html
+++ b/docs/posts/2020-06-30-treemap-with-ggplot/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-07-13-geom-paired-raincloud/index.html b/docs/posts/2020-07-13-geom-paired-raincloud/index.html
index 6e73677..4f7aa61 100644
--- a/docs/posts/2020-07-13-geom-paired-raincloud/index.html
+++ b/docs/posts/2020-07-13-geom-paired-raincloud/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-07-20-shiny-tips-1/index.html b/docs/posts/2020-07-20-shiny-tips-1/index.html
index 5b7d450..349abd6 100644
--- a/docs/posts/2020-07-20-shiny-tips-1/index.html
+++ b/docs/posts/2020-07-20-shiny-tips-1/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-07-29-six-years-of-my-spotify-playlists/index.html b/docs/posts/2020-07-29-six-years-of-my-spotify-playlists/index.html
index bd47814..e26700e 100644
--- a/docs/posts/2020-07-29-six-years-of-my-spotify-playlists/index.html
+++ b/docs/posts/2020-07-29-six-years-of-my-spotify-playlists/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-08-04-tidytuesday-2020-week-32/index.html b/docs/posts/2020-08-04-tidytuesday-2020-week-32/index.html
index eccc2f4..7eb39c1 100644
--- a/docs/posts/2020-08-04-tidytuesday-2020-week-32/index.html
+++ b/docs/posts/2020-08-04-tidytuesday-2020-week-32/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-08-07-saving-a-line-of-piping/index.html b/docs/posts/2020-08-07-saving-a-line-of-piping/index.html
index 4e76474..12258ea 100644
--- a/docs/posts/2020-08-07-saving-a-line-of-piping/index.html
+++ b/docs/posts/2020-08-07-saving-a-line-of-piping/index.html
@@ -2618,7 +2618,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-08-17-tidytuesday-2020-week-33/index.html b/docs/posts/2020-08-17-tidytuesday-2020-week-33/index.html
index 78535ce..8707146 100644
--- a/docs/posts/2020-08-17-tidytuesday-2020-week-33/index.html
+++ b/docs/posts/2020-08-17-tidytuesday-2020-week-33/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-09-06-fonts-for-graphs/index.html b/docs/posts/2020-09-06-fonts-for-graphs/index.html
index 4a04319..5b54af5 100644
--- a/docs/posts/2020-09-06-fonts-for-graphs/index.html
+++ b/docs/posts/2020-09-06-fonts-for-graphs/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-09-12-videos-in-reactable/index.html b/docs/posts/2020-09-12-videos-in-reactable/index.html
index 9204ef8..b60a727 100644
--- a/docs/posts/2020-09-12-videos-in-reactable/index.html
+++ b/docs/posts/2020-09-12-videos-in-reactable/index.html
@@ -2623,7 +2623,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-09-14-tidytuesday-2020-week-38/index.html b/docs/posts/2020-09-14-tidytuesday-2020-week-38/index.html
index e025667..1606eb8 100644
--- a/docs/posts/2020-09-14-tidytuesday-2020-week-38/index.html
+++ b/docs/posts/2020-09-14-tidytuesday-2020-week-38/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-09-20-plot-makeover-1/index.html b/docs/posts/2020-09-20-plot-makeover-1/index.html
index ebaebaa..ecef8cd 100644
--- a/docs/posts/2020-09-20-plot-makeover-1/index.html
+++ b/docs/posts/2020-09-20-plot-makeover-1/index.html
@@ -2619,7 +2619,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-09-23-tidytuesday-2020-week-39/index.html b/docs/posts/2020-09-23-tidytuesday-2020-week-39/index.html
index 69f0523..6262274 100644
--- a/docs/posts/2020-09-23-tidytuesday-2020-week-39/index.html
+++ b/docs/posts/2020-09-23-tidytuesday-2020-week-39/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-09-26-demystifying-stat-layers-ggplot2/index.html b/docs/posts/2020-09-26-demystifying-stat-layers-ggplot2/index.html
index 6a62f3a..ea8d29f 100644
--- a/docs/posts/2020-09-26-demystifying-stat-layers-ggplot2/index.html
+++ b/docs/posts/2020-09-26-demystifying-stat-layers-ggplot2/index.html
@@ -2614,7 +2614,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-10-13-designing-guiding-aesthetics/index.html b/docs/posts/2020-10-13-designing-guiding-aesthetics/index.html
index 19b875c..d68251c 100644
--- a/docs/posts/2020-10-13-designing-guiding-aesthetics/index.html
+++ b/docs/posts/2020-10-13-designing-guiding-aesthetics/index.html
@@ -2617,7 +2617,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-10-22-analysis-of-everycolorbots-tweets/index.html b/docs/posts/2020-10-22-analysis-of-everycolorbots-tweets/index.html
index 6d0342d..acd94eb 100644
--- a/docs/posts/2020-10-22-analysis-of-everycolorbots-tweets/index.html
+++ b/docs/posts/2020-10-22-analysis-of-everycolorbots-tweets/index.html
@@ -2618,7 +2618,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-10-28-tidytuesday-2020-week-44/index.html b/docs/posts/2020-10-28-tidytuesday-2020-week-44/index.html
index 300c6f6..a705b52 100644
--- a/docs/posts/2020-10-28-tidytuesday-2020-week-44/index.html
+++ b/docs/posts/2020-10-28-tidytuesday-2020-week-44/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-11-03-tidytuesday-2020-week-45/index.html b/docs/posts/2020-11-03-tidytuesday-2020-week-45/index.html
index c00c2d5..79bf13b 100644
--- a/docs/posts/2020-11-03-tidytuesday-2020-week-45/index.html
+++ b/docs/posts/2020-11-03-tidytuesday-2020-week-45/index.html
@@ -2612,7 +2612,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-11-08-plot-makeover-2/index.html b/docs/posts/2020-11-08-plot-makeover-2/index.html
index 469145f..b4f67c1 100644
--- a/docs/posts/2020-11-08-plot-makeover-2/index.html
+++ b/docs/posts/2020-11-08-plot-makeover-2/index.html
@@ -2625,7 +2625,7 @@ ${suggestion.title}
diff --git a/docs/posts/2020-12-13-collapse-repetitive-piping-with-reduce/index.html b/docs/posts/2020-12-13-collapse-repetitive-piping-with-reduce/index.html
index 26c020b..b570fa2 100644
--- a/docs/posts/2020-12-13-collapse-repetitive-piping-with-reduce/index.html
+++ b/docs/posts/2020-12-13-collapse-repetitive-piping-with-reduce/index.html
@@ -2620,7 +2620,7 @@ ${suggestion.title}
diff --git a/docs/posts/2021-01-17-random-sampling-a-table-animation/index.html b/docs/posts/2021-01-17-random-sampling-a-table-animation/index.html
index 65485a5..a465e08 100644
--- a/docs/posts/2021-01-17-random-sampling-a-table-animation/index.html
+++ b/docs/posts/2021-01-17-random-sampling-a-table-animation/index.html
@@ -2614,7 +2614,7 @@ ${suggestion.title}
diff --git a/docs/posts/2021-06-24-setting-up-and-debugging-custom-fonts/index.html b/docs/posts/2021-06-24-setting-up-and-debugging-custom-fonts/index.html
index 48f2651..4dcc777 100644
--- a/docs/posts/2021-06-24-setting-up-and-debugging-custom-fonts/index.html
+++ b/docs/posts/2021-06-24-setting-up-and-debugging-custom-fonts/index.html
@@ -2619,7 +2619,7 @@ ${suggestion.title}
diff --git a/docs/posts/2022-03-10-ggplot2-delayed-aes-1/index.html b/docs/posts/2022-03-10-ggplot2-delayed-aes-1/index.html
index aef1fda..1c428f0 100644
--- a/docs/posts/2022-03-10-ggplot2-delayed-aes-1/index.html
+++ b/docs/posts/2022-03-10-ggplot2-delayed-aes-1/index.html
@@ -2621,7 +2621,7 @@ ${suggestion.title}
diff --git a/docs/posts/2022-07-06-ggplot2-delayed-aes-2/index.html b/docs/posts/2022-07-06-ggplot2-delayed-aes-2/index.html
index 37c17b7..2d78c3c 100644
--- a/docs/posts/2022-07-06-ggplot2-delayed-aes-2/index.html
+++ b/docs/posts/2022-07-06-ggplot2-delayed-aes-2/index.html
@@ -2630,7 +2630,7 @@ ${suggestion.title}
diff --git a/docs/posts/2022-07-30-user2022/index.html b/docs/posts/2022-07-30-user2022/index.html
index 26345fb..8bfbbdd 100644
--- a/docs/posts/2022-07-30-user2022/index.html
+++ b/docs/posts/2022-07-30-user2022/index.html
@@ -2620,7 +2620,7 @@ ${suggestion.title}
diff --git a/docs/posts/2022-11-13-dataframes-jl-and-accessories/index.html b/docs/posts/2022-11-13-dataframes-jl-and-accessories/index.html
index 4c89b5d..eed45ed 100644
--- a/docs/posts/2022-11-13-dataframes-jl-and-accessories/index.html
+++ b/docs/posts/2022-11-13-dataframes-jl-and-accessories/index.html
@@ -2622,7 +2622,7 @@ ${suggestion.title}
diff --git a/docs/posts/2023-06-11-row-relational-operations/index.html b/docs/posts/2023-06-11-row-relational-operations/index.html
index 43af455..2d55264 100644
--- a/docs/posts/2023-06-11-row-relational-operations/index.html
+++ b/docs/posts/2023-06-11-row-relational-operations/index.html
@@ -2626,7 +2626,7 @@ ${suggestion.title}
diff --git a/docs/posts/2023-07-09-x-y-problem/index.html b/docs/posts/2023-07-09-x-y-problem/index.html
index da6cd66..db0994a 100644
--- a/docs/posts/2023-07-09-x-y-problem/index.html
+++ b/docs/posts/2023-07-09-x-y-problem/index.html
@@ -2620,7 +2620,7 @@ ${suggestion.title}
diff --git a/docs/posts/2023-12-03-untidy-select/index.html b/docs/posts/2023-12-03-untidy-select/index.html
index fd64a0c..e71f497 100644
--- a/docs/posts/2023-12-03-untidy-select/index.html
+++ b/docs/posts/2023-12-03-untidy-select/index.html
@@ -2626,7 +2626,7 @@ ${suggestion.title}
diff --git a/docs/posts/2023-12-31-2023-year-in-review/index.html b/docs/posts/2023-12-31-2023-year-in-review/index.html
index 46be6db..043dd52 100644
--- a/docs/posts/2023-12-31-2023-year-in-review/index.html
+++ b/docs/posts/2023-12-31-2023-year-in-review/index.html
@@ -2620,7 +2620,7 @@ ${suggestion.title}
diff --git a/docs/posts/posts.json b/docs/posts/posts.json
index 2456917..877463e 100644
--- a/docs/posts/posts.json
+++ b/docs/posts/posts.json
@@ -15,7 +15,7 @@
],
"contents": "\r\n\r\nContents\r\nIntro\r\nResearch\r\nBlogging\r\nR stuff\r\nPersonal\r\n\r\n\r\n\r\n\r\nFigure 1: New year’s eve celebration fireworks at Long Beach, CA.\r\n\r\n\r\n\r\nIntro\r\nI’ve been seeing a couple folks on Mastodon sharing their “year in review” blog posts, and I thought that was really cool, so I decided to write my own too! I’m mostly documenting for myself but hopefully this also serves as an update of a sort for my friends over the internet since I’ve been pretty silent online this year.\r\nResearch\r\nBeing the Good Grad Student™ I am, I’m forefronting my academia happenings first. In numbers, I published one paper, gave two talks, and presented three posters. I’m not super proud of those numbers: I think they’re a lot less than what people might expect from a 4th year PhD student. But a lot of effort went into each1 and 2023 overall has been a great year for refining and narrowing down on my dissertation topic.2 I did a ton of readings and I hope it pays off for next year when I actually get started on writing the thing.\r\nI already document my research happenings elsewhere and I know that the primarily audience of my blog isn’t linguists, so I won’t expand on that more here.\r\nBlogging\r\n2023 was the year when it became painfully obvious to me that I don’t have much in terms of a portfolio in the sense of the buzzword-y “data science portfolio” that industry recruiters purportedly look for. This ironically coincided with another realization I had, which is that I’m increasingly becoming “the department tech/stats guy” where I take on many small tasks and favors from faculty and other students here and there; I truly do enjoy doing this work, but it’s completely invisible to my CV/resume. I’m still navigating this weird position I’m in, but I’ve found some nice tips3 and at least I still have another year until I’m on the job market to fully figure this out.\r\nThe reason why I put the above rant under the “Blogging” section is because my blog is the closest thing I have a portfolio - there’s not much here, but it’s a public-facing space I own where I get to show people what I know and how I think. So in 2023 I was more conscious about what I blog about and how. The change was subtle - my blog persona is still my usual self, but I’ve tried to diversify the style of my blogs. Whereas I mostly wrote long-form, tutorial-style blog posts in the past, I only wrote one such post (on dplyr::slice()) this year. My other blog posts were one reflecting on how to better answer other people’s questions, and another where I nerd out on the internals of {tidyselect} with little regard for its practicality.4.\r\nAll in all, I wrote three blog posts this year (not including this one). This is the usual rate of publishing blog posts for me, but I hope to write more frequently next year (and write shorter posts overall, and in less formal tone).\r\nR stuff\r\nI didn’t think I’d have much to say about the R stuff I did this year until I sat down to write this blog. Even though this year was the busiest I’ve ever been with research, it turns out that I still ended up doing quite a bit of R stuff in my free time. I’ll cover this chronologically.\r\n\r\nAt the beginning of the year, I was really lucky to receive the student paper award from the Statistical Computing and Graphics section of the ASA, writing about {ggtrace}.5 In the paper, I focused on {ggtrace} as a pedagogical tool for aspiring {ggplot2} extension developers. In the process, I rediscovered the power of reframing ggplot internals as data wrangling and went back to {ggtrace} to add a couple convenience functions for interactive use-cases. After over two years since its inception, {ggtrace} now feels pretty complete in terms of its core features (but suggestions and requests are always welcome!).\r\n\r\nIn Spring, I began writing {jlmerclusterperm}, a statistical package implementing the cluster-based permutation test for time series data, using mixed-effects models. This was a new challenge for me for two reasons. First, I wrote much of the package in Julia - this was my first time writing Julia code for “production” and within an R package.6 Second, I wrote this package for a seminar on eye movements that I was taking that Spring in the psychology department. I wrote {jlmerclusterperm} in an intense burst - most of it was complete by the end of May and I turned in the package as my final.7 I also gave a school-internal talk on it in April; my first time talking about R in front of an entirely academic audience.\r\nIn Summer, I continued polishing {jlmerclusterperm} with another ambitious goal of getting it to CRAN, at the suggestion of a couple researchers who said they’d like to use it for their own research. The already-hard task of getting through my first CRAN submission was compounded by the fact that the package contained Julia code - it took nine resubmissions in the span of two months to finally get {jlmerclusterperm} stably on CRAN.8\r\n\r\n\r\n\r\nFigure 2: Group photo taken at SMLP2023.\r\n\r\n\r\n\r\nAt the beginning of Fall, I attended the Advanced Frequentist stream of the SMLP2023 workshop, taught by Phillip Alday, Reinhold Kliegl and Douglas Bates. The topic was mixed-effects regression models in Julia, one that I became very excited about especially after working on {jlmerclusterperm}. It was an absolute blast and I wish that everyone in linguistics/psychology research appreciated good stats/data analysis as much as the folks I met there. The workshop was far away in Germany (my first time ever in Europe!) and I’m really thankful to MindCORE for giving me a grant to help with travel expenses.\r\n\r\nFor most of Fall, I didn’t do much R stuff, especially with the start of the Fall semester and a big conference looming on the horizon. But the little time I did spend on it, I worked on maintenance and upkeep for {openalexR}, one of my few collaborative projects. It’s also one of the few packages for which I’m an author of that I actually frequently use myself. I used {openalexR} a lot during the Fall semester for conducting literature reviews in preparation for my dissertation proposal, so I had a few opportunities to catch bugs and work on other improvements. I also spent a lot of my time in the Fall TA-ing for an undergraduate data science class that we recently started offering in our department. This was actually my third year in a row TA-ing it, so it went pretty smoothly. I even learned some new quirky R behaviors from my students along the way.\r\n\r\nIn October, I virtually attended the R/Pharma conference and joined a workshop on data validation using the {pointblank} package by Rich Iannone. I had used {pointblank} a little before, but I didn’t explore its features much because I thought it had some odd behaviors that I couldn’t comprehend. The workshop cleared up some of the confusion for me, and Rich made it clear in the workshop that he welcomed contributions to improve the package. So I made a PR addressing the biggest pain point I personally had with using {pointblank}. This turned out to be a pretty big undertaking which took over a month to complete. In the process, I become a co-author of {pointblank}, and I merged a series of PRs that improved the consistency of function designs, among other things.\r\nThe last R thing I did this year was actually secretly Julia - in December I gave a school-internal workshop on fitting mixed effects in Julia, geared towards an academic audience with prior experience in R. I advocated for a middle-ground approach where you can keep doing everything in R and RStudio, except move just the modelling workflow into Julia. I live-coded some Julia code and ran it from RStudio, which I think wasn’t too difficult to grasp.9 I have a half-baked package of addins to make R-Julia interoperability smoother in RStudio; I hope to wrap it up and share it some day.\r\nThat brings me to the present moment, where I’m currently taking a break from FOSS to focus on my research, as my dissertation proposal defense is coming up soon. I will continue to be responsive with maintaining {jlmerclusterperm} during this time (since there’s an active user-base of researchers who find it useful) but my other projects will become low priority. I also don’t think I’ll be starting a new project any time soon, but in the near future I hope I come up with something cool that lets me test-drive {S7}!\r\nPersonal\r\nThis year, I tried to be less of a workaholic. I think I did an okay job at it, and it mostly came in the form of diversifying my hobbies (R used to be my only hobby since starting grad school). I got back into ice skating10 and, briefly, swimming,11 and I’m fortunate that both are available literally two blocks away from my department. My girlfriend and I got really into escape rooms this year, mostly playing online ones due to budget constraints.12 I also got back into playing Steam games13 and racked up over 300 hours on Slay the Spire, mostly from the ~2 weeks recovering from covid in September.14\r\nAnd of course, I have many people to thank for making this a wonderful year.15 Happy new year to all!\r\n\r\nI was the first author for all research that I presented, as is often the case in linguistics.↩︎\r\nBroadly, how kids learn words with overlapping meanings like “dalmatian”<“dog”<“animal” from the language input.↩︎\r\nLike this blog post.↩︎\r\nA style heavily inspired by some of my favorite R bloggers like Matt Dray and Jonathan Carroll↩︎\r\nCoincidentally, my girlfriend also won a student award this year from another ASA - the Acoustical Society of America.↩︎\r\nI can’t recommend {JuliaConnectoR} enough for this.↩︎\r\nI’m actually quite proud of myself for pulling this off - writing an R package for the final was unprecedented for the class.↩︎\r\nIn the process, I received the elusive CRAN Note for exceeding 6 updates in under a month (CRAN recommends one update every 1-2 months).↩︎\r\nUsing some tricks described in the workshop materials.↩︎\r\nI used to play ice hockey competitively as a kid.↩︎\r\nTurns out that swimming does not play well with my preexisting ear conditions.↩︎\r\nMost recently we played Hallows Hill which I think is the best one we’ve played so far↩︎\r\nI’m very into roguelike genres but haven’t really played video games since high school.↩︎\r\nFor the fellow nerds, I reached A20 on Ironclad, Defect, and Watcher. I’m working my way up for Silent.↩︎\r\nI’m feeling shy so this goes in the footnotes. In roughly chronological order, I’m firstly indebted to Sam Tyner-Monroe who encouraged me write up {ggtrace} for the ASA paper award after my rstudio::conf talk on it last year. I’m grateful to Gina Reynolds and Teun van den Brand (and others in the ggplot extension club) for engaging in many insightful data viz/ggplot internals discussions with me. I’m also grateful to my FOSS collaborators, especially Trang Le, from whom I’ve learned a lot about code review and package design principles while working on {openalexR} together. Last but not least, I owe a lot to Daniel Sjoberg and Shannon Pileggi for a recent development that I’m not ready to publicly share yet 🤫.↩︎\r\n",
"preview": "posts/2023-12-31-2023-year-in-review/preview.png",
- "last_modified": "2024-01-01T12:43:40-08:00",
+ "last_modified": "2024-01-01T15:43:40-05:00",
"input_file": {},
"preview_width": 1512,
"preview_height": 1371
@@ -38,7 +38,7 @@
],
"contents": "\r\n\r\nContents\r\nIntro\r\nSome observations\r\ntidy-select!\r\ntidy?-select\r\nuntidy-select?\r\nuntidy-select!\r\n\r\nTidying untidy-select\r\nWriting untidy-select helpers\r\n1) times()\r\n2) offset()\r\n3) neighbors()\r\nDIY!\r\n\r\nLet’s get practical\r\n1) Sorting columns\r\n2) Error handling\r\n\r\nConclusion\r\n\r\nIntro\r\nRecently, I’ve been having frequent run-ins with {tidyselect} internals, discovering some weird and interesting behaviors along the way. This blog post is my attempt at documenting a couple of these. And as is the case with my usual style of writing, I’m gonna talk about some of the weirder stuff first and then touch on some of the “practical” side to this.\r\nSome observations\r\nLet’s start with some facts about how {tidyselect} is supposed to work. I’ll use this toy data for the demo:\r\n\r\n\r\nlibrary(dplyr, warn.conflicts = FALSE)\r\nlibrary(tidyselect)\r\ndf <- tibble(x = 1:2, y = letters[1:2], z = LETTERS[1:2])\r\ndf\r\n\r\n # A tibble: 2 × 3\r\n x y z \r\n \r\n 1 1 a A \r\n 2 2 b B\r\n\r\ntidy-select!\r\n{tidyselect} is the package that powers dplyr::select(). If you’ve used {dplyr}, you already know the behavior of select() pretty well. We can specify a column as string, symbol, or by its position:\r\n\r\n\r\ndf %>% \r\n select(\"x\")\r\n\r\n # A tibble: 2 × 1\r\n x\r\n \r\n 1 1\r\n 2 2\r\n\r\ndf %>% \r\n select(x)\r\n\r\n # A tibble: 2 × 1\r\n x\r\n \r\n 1 1\r\n 2 2\r\n\r\ndf %>% \r\n select(1)\r\n\r\n # A tibble: 2 × 1\r\n x\r\n \r\n 1 1\r\n 2 2\r\n\r\nIt’s not obvious from the outside, but the way this works is that these user-supplied expressions (like \"x\", x, and 1) all get resolved to integer before the selection actually happens.\r\nSo to be more specific, the three calls to select() were the same because these three calls to tidyselect::eval_select() are the same:1\r\n\r\n\r\neval_select(quote(\"x\"), df)\r\n\r\n x \r\n 1\r\n\r\neval_select(quote(x), df)\r\n\r\n x \r\n 1\r\n\r\neval_select(quote(1), df)\r\n\r\n x \r\n 1\r\n\r\nYou can also see eval_select() in action in the method for select():\r\n\r\n\r\ndplyr:::select.data.frame\r\n\r\n function (.data, ...) \r\n {\r\n error_call <- dplyr_error_call()\r\n loc <- tidyselect::eval_select(expr(c(...)), data = .data, \r\n error_call = error_call)\r\n loc <- ensure_group_vars(loc, .data, notify = TRUE)\r\n out <- dplyr_col_select(.data, loc)\r\n out <- set_names(out, names(loc))\r\n out\r\n }\r\n \r\n \r\n\r\ntidy?-select\r\nBecause the column subsetting part is ultimately done using integers, we can theoretically pass select() any expression, as long as it resolves to an integer vector.\r\nFor example, we can use 1 + 1 to select the second column:\r\n\r\n\r\ndf %>% \r\n select(1 + 1)\r\n\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\nAnd vector recycling is still a thing here too - we can use c(1, 2) + 1 to select the second and third columns:\r\n\r\n\r\ndf %>% \r\n select(c(1, 2) + 1)\r\n\r\n # A tibble: 2 × 2\r\n y z \r\n \r\n 1 a A \r\n 2 b B\r\n\r\nOrdinary function calls work as well - we can select a random column using sample():\r\n\r\n\r\ndf %>% \r\n select(sample(ncol(df), 1))\r\n\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\nWe can even use the .env pronoun to scope an integer variable from the global environment:2\r\n\r\n\r\noffset <- 1\r\ndf %>% \r\n select(1 + .env$offset)\r\n\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\nSo that’s kinda interesting.3 But what if we try to mix the different approaches to tidyselect-ing? Can we do math on columns that we’ve selected using strings and symbols?\r\nuntidy-select?\r\nUh not quite. select() doesn’t like doing math on strings and symbols.\r\n\r\n\r\ndf %>% \r\n select(x + 1)\r\n\r\n Error in `select()`:\r\n ! Problem while evaluating `x + 1`.\r\n Caused by error:\r\n ! object 'x' not found\r\n\r\ndf %>% \r\n select(\"x\" + 1)\r\n\r\n Error in `select()`:\r\n ! Problem while evaluating `\"x\" + 1`.\r\n Caused by error in `\"x\" + 1`:\r\n ! non-numeric argument to binary operator\r\n\r\nIn fact, it doesn’t even like doing certain kinds of math like multiplication (*), even with numeric constants:\r\n\r\n\r\ndf %>% \r\n select(1 * 2)\r\n\r\n Error in `select()`:\r\n ! Can't use arithmetic operator `*` in selection context.\r\n\r\nThis actually makes sense from a design POV. Adding numbers to columns probably happens more often as a mistake than something intentional. These safeguards exist to prevent users from running into cryptic errors.\r\nUnless…\r\nuntidy-select!\r\nIt turns out that {tidyselect} helpers have an interesting behavior of immediately resolving the column selection to integer. So we can get addition (+) working if we wrap our columns in redundant column selection helpers like all_of() and matches()\r\n\r\n\r\ndf %>% \r\n select(all_of(\"x\") + 1)\r\n\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\ndf %>% \r\n select(matches(\"^x$\") + 1)\r\n\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\nFor multiplication, we have to additionally circumvent the censoring of the * symbol. Here, we can simply use a different name for the same operation:4\r\n\r\n\r\n`%times%` <- `*`\r\ndf %>% \r\n select(matches(\"^x$\") %times% 2)\r\n\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\nBut geez, it’s so tiring to type all_of() and matches() all the time. There must be a better way to break the rule!\r\nTidying untidy-select\r\nLet’s make a tidy design for the untidy pattern of selecting columns by doing math on column locations. The idea is to make our own little scope inside select() where all the existing safeguards are suspended. Like a DSL within a DSL, if you will.\r\nLet’s call this function math(). It should let us express stuff like “give me the column to the right of column x” via this intuitive(?) syntax:\r\n\r\n\r\n\r\n\r\n\r\ndf %>% \r\n select(math(x + 1))\r\n\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\nThis is my take on math():\r\n\r\n\r\nmath <- function(expr) {\r\n math_expr <- rlang::enquo(expr)\r\n columns <- tidyselect::peek_vars()\r\n col_locs <- as.data.frame.list(seq_along(columns), col.names = columns)\r\n mask <- rlang::as_data_mask(col_locs)\r\n out <- rlang::eval_tidy(math_expr, mask)\r\n out\r\n}\r\n\r\n\r\nThere’s a lot of weird functions involved here, but it’s easier to digest by focusing on its parts. Here’s what each local variable in the function looks like for our math(x + 1) example above:\r\n\r\n $math_expr\r\n \r\n expr: ^x + 1\r\n env: 0x0000012f8e27cec8\r\n \r\n $columns\r\n [1] \"x\" \"y\" \"z\"\r\n \r\n $col_locs\r\n x y z\r\n 1 1 2 3\r\n \r\n $mask\r\n \r\n \r\n $out\r\n [1] 2\r\n\r\nLet’s walk through the pieces:\r\nmath_expr: the captured user expression, with the environment attached\r\ncolumns: the column names of the current dataframe, in order\r\ncol_locs: a dataframe of column names and location, created from columns\r\nmask: a data mask created from col_locs\r\nout: location of column(s) to select\r\nEssentially, math() first captures the expression to evaluate it in its own special environment, circumventing select()’s safeguards. Then, it grabs the column names of the data frame with tidyselect::peek_vars() to define col_locs and then mask. The data mask mask is then used inside rlang::eval_tidy() to resolve symbols like x to integer 1 when evaluating the captured expression x + 1. The expression math(x + 1) thus evaluates to 1 + 1. In turn, select(math(x + 1)) is evaluated to select(2), returning us the second column of the dataframe.\r\nWriting untidy-select helpers\r\nA small yet powerful detail in the implementation of math() is the fact that it captures the expression as a quosure. This allows math() to appropriately scope dynamically created variables, and not just bare symbols provided directly by the user.\r\nThis makes more sense with some examples. Here, I define helper functions that call math() under the hood with their own templatic math expressions (and I have them print() the expression as passed to math() for clarity). The fact that math() captures its argument as a quosure is what allows local variables like n to be correctly scoped in these examples.\r\n1) times()\r\n\r\n\r\ntimes <- function(col, n) {\r\n col <- rlang::ensym(col)\r\n print(rlang::expr(math(!!col * n))) # for debugging\r\n math(!!col * n)\r\n}\r\ndf %>%\r\n select(times(x, 2))\r\n\r\n math(x * n)\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\n\r\n\r\nnum2 <- 2\r\ndf %>%\r\n select(times(x, num2))\r\n\r\n math(x * n)\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\n2) offset()\r\n\r\n\r\noffset <- function(col, n) {\r\n col <- rlang::ensym(col)\r\n print(rlang::expr(math(!!col + n))) # for debugging\r\n math(!!col + n)\r\n}\r\ndf %>%\r\n select(offset(x, 1))\r\n\r\n math(x + n)\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\n\r\n\r\nnum1 <- 1\r\ndf %>%\r\n select(offset(x, num1))\r\n\r\n math(x + n)\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\n3) neighbors()\r\n\r\n\r\nneighbors <- function(col, n) {\r\n col <- rlang::ensym(col)\r\n range <- c(-(n:1), 1:n)\r\n print(rlang::expr(math(!!col + !!range))) # for debugging\r\n math(!!col + !!range)\r\n}\r\ndf %>%\r\n select(neighbors(y, 1))\r\n\r\n math(y + c(-1L, 1L))\r\n # A tibble: 2 × 2\r\n x z \r\n \r\n 1 1 A \r\n 2 2 B\r\n\r\n\r\n\r\ndf %>%\r\n select(neighbors(y, num1))\r\n\r\n math(y + c(-1L, 1L))\r\n # A tibble: 2 × 2\r\n x z \r\n \r\n 1 1 A \r\n 2 2 B\r\n\r\nDIY!\r\nAnd of course, we can do arbitrary injections ourselves as well with !! or .env$:\r\n\r\n\r\ndf %>%\r\n select(math(x * !!num2))\r\n\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\ndf %>%\r\n select(math(x * .env$num2))\r\n\r\n # A tibble: 2 × 1\r\n y \r\n \r\n 1 a \r\n 2 b\r\n\r\nThat was fun but probably not super practical. Let’s set math() aside to try our hands on something more useful.\r\nLet’s get practical\r\n1) Sorting columns\r\nProbably one of the hardest things to do idiomatically in the tidyverse is sorting (a subset of) columns by their name. For example, consider this dataframe which is a mix of columns that follow some fixed pattern (\"x|y_\\\\d\") and those outside that pattern (\"year\", \"day\", etc.).\r\n\r\n\r\ndata_cols <- expand.grid(first = c(\"x\", \"y\"), second = 1:3) %>%\r\n mutate(cols = paste0(first, \"_\", second)) %>%\r\n pull(cols)\r\ndf2 <- as.data.frame.list(seq_along(data_cols), col.names = data_cols)\r\ndf2 <- cbind(df2, storms[1,1:5])\r\ndf2 <- df2[, sample(ncol(df2))]\r\ndf2\r\n\r\n y_3 x_3 month day hour y_2 y_1 x_2 year name x_1\r\n 1 6 5 6 27 0 4 2 3 1975 Amy 1\r\n\r\nIt’s trivial to select columns by pattern - we can use the matches() helper:\r\n\r\n\r\ndf2 %>%\r\n select(matches(\"(x|y)_(\\\\d)\"))\r\n\r\n y_3 x_3 y_2 y_1 x_2 x_1\r\n 1 6 5 4 2 3 1\r\n\r\nBut what if I also wanted to further sort these columns, after I select them? There’s no easy way to do this “on the fly” inside of select, especially if we want the flexibility to sort the columns by the letter vs. the number.\r\nBut here’s one way of getting at that, exploiting two facts:\r\nmatches(), like other tidyselect helpers, immediately resolves the selection to integer\r\npeek_vars() returns the column names in order, which lets us recover the column names from location\r\nAnd that’s pretty much all there is to the tidyselect magic that goes into my solution below. I define locs (integer vector of column locations) and cols (character vector of column names at those locations), and the rest is just regex and sorting:\r\n\r\n\r\nordered_matches <- function(matches, order) {\r\n # tidyselect magic\r\n locs <- tidyselect::matches(matches)\r\n cols <- tidyselect::peek_vars()[locs]\r\n # Ordinary evaluation\r\n groups <- simplify2array(regmatches(cols, regexec(matches, cols)))[-1,]\r\n reordered <- do.call(\"order\", asplit(groups[order, ], 1))\r\n locs[reordered]\r\n}\r\n\r\n\r\nUsing ordered_matches(), we can not only select columns but also sort them using regex capture groups.\r\nThis sorts the columns by letter first then number:\r\n\r\n\r\ndf2 %>%\r\n select(ordered_matches(\"(x|y)_(\\\\d)\", c(1, 2)))\r\n\r\n x_1 x_2 x_3 y_1 y_2 y_3\r\n 1 1 3 5 2 4 6\r\n\r\nThis sorts the columns by number first then letter:\r\n\r\n\r\ndf2 %>%\r\n select(ordered_matches(\"(x|y)_(\\\\d)\", c(2, 1)))\r\n\r\n x_1 y_1 x_2 y_2 x_3 y_3\r\n 1 1 2 3 4 5 6\r\n\r\nAnd if we wanted the other columns too, we can use everything() to grab the “rest”:\r\n\r\n\r\ndf2 %>%\r\n select(ordered_matches(\"(x|y)_(\\\\d)\", c(2, 1)), everything())\r\n\r\n x_1 y_1 x_2 y_2 x_3 y_3 month day hour year name\r\n 1 1 2 3 4 5 6 6 27 0 1975 Amy\r\n\r\n2) Error handling\r\nOne of the really nice parts about the {tidyselect} design is the fact that error messages are very informative.\r\nFor example, if you select a non-existing column, it errors while pointing out that mistake:\r\n\r\n\r\ndf3 <- data.frame(x = 1)\r\nnonexistent_selection <- quote(c(x, y))\r\neval_select(nonexistent_selection, df3)\r\n\r\n Error:\r\n ! Can't subset columns that don't exist.\r\n ✖ Column `y` doesn't exist.\r\n\r\nIf you use a tidyselect helper that returns nothing, it won’t complain by default:\r\n\r\n\r\nzero_selection <- quote(starts_with(\"z\"))\r\neval_select(zero_selection, df3)\r\n\r\n named integer(0)\r\n\r\nBut you can make that error with allow_empty = FALSE:\r\n\r\n\r\neval_select(zero_selection, df3, allow_empty = FALSE)\r\n\r\n Error:\r\n ! Must select at least one item.\r\n\r\nGeneral evaluation errors are caught and chained:\r\n\r\n\r\nevaluation_error <- quote(stop(\"I'm a bad expression!\"))\r\neval_select(evaluation_error, df3)\r\n\r\n Error:\r\n ! Problem while evaluating `stop(\"I'm a bad expression!\")`.\r\n Caused by error:\r\n ! I'm a bad expression!\r\n\r\nThese error signalling patterns are clearly very useful for users,5 but there’s a little gem in there for developers too. It turns out that the error condition object contains these information too, which lets you detect different error types programmatically to forward errors to your own error handling logic.\r\nFor example, the attempted non-existent column is stored in $i:6\r\n\r\n\r\ncnd_nonexistent <- rlang::catch_cnd(\r\n eval_select(nonexistent_selection, df3)\r\n)\r\ncnd_nonexistent$i\r\n\r\n [1] \"y\"\r\n\r\nZero column selections give you NULL in $i when you set it to error:\r\n\r\n\r\ncnd_zero_selection <- rlang::catch_cnd(\r\n eval_select(zero_selection, df3, allow_empty = FALSE)\r\n)\r\ncnd_zero_selection$i\r\n\r\n NULL\r\n\r\nGeneral evaluation errors are distinguished by having a $parent:\r\n\r\n\r\ncnd_evaluation_error <- rlang::catch_cnd(\r\n eval_select(evaluation_error, df3)\r\n)\r\ncnd_evaluation_error$parent\r\n\r\n \r\n\r\nAgain, this is more useful as a developer, if you’re building something that integrates {tidyselect}.7 But I personally find this interesting to know about anyways!\r\nConclusion\r\nHere I end with the (usual) disclaimer to not actually just copy paste these for production - they’re written with the very low standard of scratching my itch, so they do not come with any warranty!\r\nBut I hope that this was a fun exercise in thinking through one of the most mysterious magics in {dplyr}. I’m sure to reference this many times in the future myself.\r\n\r\nThe examples quote(\"x\") and quote(1) are redundant because \"x\" and 1 are constants. I keep quote() in there just to make the comparison clearer↩︎\r\nNot to be confused with all_of(). The idiomatic pattern for scoping an external character vector is to do all_of(x) not .env$x. It’s only when you’re scoping a non-character vector that you’d use .env$.↩︎\r\nIt’s also strangely reminiscent of my previous blog post on dplyr::slice()↩︎\r\nThanks to Jonathan Carroll for this suggestion!↩︎\r\nFor those who actually read error messages, at least (points to self) …↩︎\r\nThough {tidyselect} errors early, so it’ll only record the first attempted column causing the error. You could use a while() loop (catch and remove bad columns from the data until there’s no more error) if you really wanted to get the full set of offending columns.↩︎\r\nIf you want some examples of post-processing tidyselect errors, there’s some stuff I did for pointblank that may be helpful as a reference.↩︎\r\n",
"preview": "posts/2023-12-03-untidy-select/preview.png",
- "last_modified": "2023-12-04T07:11:22-08:00",
+ "last_modified": "2023-12-04T10:11:22-05:00",
"input_file": {},
"preview_width": 957,
"preview_height": 664
@@ -59,7 +59,7 @@
],
"contents": "\r\n\r\nContents\r\nIntro\r\nWhat is an XY problem?\r\nThe question\r\nAttempt 1: after_stat()? I know that!\r\nAttempt 2: Hmm but why not after_scale()?\r\nAttempt 3: Oh. You just wanted a scale_fill_*()…\r\nReflections\r\nEnding on a fun aside - accidentally escaping an XY problem\r\n\r\nIntro\r\nA few months ago, over at the R4DS slack (http://r4ds.io/join), someone posted a ggplot question that was within my area of “expertise”. I got tagged in the thread, I went in, and it took me 3 tries to arrive at the correct solution that the poster was asking for.\r\nThe embarrassing part of the exchange was that I would write one solution, think about what I wrote for a bit, and then write a different solution after realizing that I had misunderstood the intent of the original question. In other words, I was consistently missing the point.\r\nThis is a microcosm of a bigger problem of mine that I’ve been noticing lately, as my role in the R community has shifted from mostly asking questions to mostly answering questions. By this point I’ve sort of pin-pointed the problem: I have a hard time recognizing that I’m stuck in an XY problem.\r\nI have a lot of thoughts on this and I want to document them for future me,1 so here goes a rant. I hope it’s useful to whoever is reading this too.\r\nWhat is an XY problem?\r\nAccording to Wikipedia:\r\n\r\nThe XY problem is a communication problem… where the question is about an end user’s attempted solution (Y) rather than the root problem itself (X).\r\n\r\nThe classic example of this is when a (novice) user asks how to extract the last 3 characters in a filename. There’s no good reason to blindly grab the last 3 characters, so what they probably meant to ask is how to get the file extension (which is not always 3 characters long, like .R or .Rproj).2\r\nAnother somewhat related cult-classic, copypasta3 example is the “Don’t use regex to parse HTML” answer on stackoverflow. Here, a user asks how to use regular expressions to match HTML tags, to which the top-voted answer is don’t (instead, you should use a dedicated parser). The delivery of this answer is a work of art, so I highly suggest you giving it a read if you haven’t seen it already (the link is above for your amusement).\r\nAn example of an XY problem in R that might hit closer to home is when a user complains about the notorious Object of type 'closure' is not subsettable error. It’s often brought up as a cautionary tale for novice users (error messages can only tell you so much, so you must develop debugging strategies), but it has a special meaning for more experienced users who’ve been bit by this multiple times. So for me, when I see novice users reporting this specific error, I usually ask them if they have a variable called data and whether they forgot to run the line assigning that variable. Of course, this answer does not explain what the error means,4 but oftentimes it’s the solution that the user is looking for.\r\n\r\n\r\n# Oops forgot to define `data`!\r\n# `data` is a function (in {base}), which is not subsettable\r\ndata$value\r\n\r\n Error in data$value: object of type 'closure' is not subsettable\r\n\r\nAs one last example, check out this lengthy exchange on splitting a string (Y) to parse JSON (X). I felt compelled to include this example because it does a good job capturing the degree of frustration (very high) that normally comes with XY problems.\r\nBut the thing about the XY problem is that it often prompts the lesson of asking good questions: don’t skip steps in your reasoning, make your goals/intentions clear, use a reprex,5 and so on. But in so far as it’s a communication problem involving both parties, I think we should also talk about what the person answering the question can do to recognize an XY problem and break out of it.\r\nEnter me, someone who really needs to do a better job of recognizing when I’m stuck in an XY problem. So with the definition out of the way, let’s break down how I messed up.\r\nThe question\r\nThe question asks:\r\n\r\nDoes anyone know how to access the number of bars in a barplot? I’m looking for something that will return “15” for the following code, that can be used within ggplot, like after_stat()\r\n\r\nThe question comes with an example code. Not exactly a reprex, but something to help understand the question:\r\n\r\n\r\np <- ggplot(mpg, aes(manufacturer, fill = manufacturer)) +\r\n geom_bar()\r\np\r\n\r\n\r\n\r\nThe key phrase in the question is “can be used within ggplot”. So the user isn’t looking for something like this even though it’s conceptually equivalent:\r\n\r\n\r\nlength(unique(mpg$manufacturer))\r\n\r\n [1] 15\r\n\r\nThe idea here is that ggplot knows that there are 15 bars, so this fact must represented somewhere in the internals. The user wants to be able to access that value dynamically.\r\nAttempt 1: after_stat()? I know that!\r\nThe very last part of the question “… like after_stat()” triggered some alarms in the thread and got me called in. For those unfamiliar, after_stat() is part of the new and obscure family of delayed aesthetic evaluation functions introduced in ggplot 3.3.0. It’s something that you normally don’t think about in ggplot, but it’s a topic that I’ve been obsessed with for the last 2 years or so: it has resulted in a paper, a package (ggtrace), blog posts, and talks (useR!, rstudio::conf, JSM).\r\nThe user asked about after_stat(), so naturally I came up with an after_stat() solution. In the after-stat stage of the bar layer’s data, the layer data looks like this:\r\n\r\n\r\n# remotes::install_github(\"yjunechoe/ggtrace\")\r\nlibrary(ggtrace)\r\n# Grab the state of the layer data in the after-stat\r\nlayer_after_stat(p)\r\n\r\n # A tibble: 15 × 8\r\n count prop x width flipped_aes fill PANEL group\r\n \r\n 1 18 1 1 0.9 FALSE audi 1 1\r\n 2 19 1 2 0.9 FALSE chevrolet 1 2\r\n 3 37 1 3 0.9 FALSE dodge 1 3\r\n 4 25 1 4 0.9 FALSE ford 1 4\r\n 5 9 1 5 0.9 FALSE honda 1 5\r\n 6 14 1 6 0.9 FALSE hyundai 1 6\r\n 7 8 1 7 0.9 FALSE jeep 1 7\r\n 8 4 1 8 0.9 FALSE land rover 1 8\r\n 9 3 1 9 0.9 FALSE lincoln 1 9\r\n 10 4 1 10 0.9 FALSE mercury 1 10\r\n 11 13 1 11 0.9 FALSE nissan 1 11\r\n 12 5 1 12 0.9 FALSE pontiac 1 12\r\n 13 14 1 13 0.9 FALSE subaru 1 13\r\n 14 34 1 14 0.9 FALSE toyota 1 14\r\n 15 27 1 15 0.9 FALSE volkswagen 1 15\r\n\r\nIt’s a tidy data where each row represents a barplot. So the number of bars is the length of any column in the after-stat data, but it’d be most principled to take the length of the group column in this case.6\r\nSo the after-stat expression that returns the desired value 15 is after_stat(length(group)), which essentially evaluates to the following:\r\n\r\n\r\nlength(layer_after_stat(p)$group)\r\n\r\n [1] 15\r\n\r\nFor example, you can use this inside the aes() to annotate the total number of bars on top of each bar:\r\n\r\n\r\nggplot(mpg, aes(manufacturer, fill = manufacturer)) +\r\n geom_bar() +\r\n geom_label(\r\n aes(label = after_stat(length(group))),\r\n fill = \"white\",\r\n stat = \"count\"\r\n )\r\n\r\n\r\n\r\nThe after_stat(length(group)) solution returns the number of bars using after_stat(), as the user asked. But as you can see this is extremely useless: there are many technical constraints on what you can actually do with this information in the after-stat stage.\r\nI should have checked if they actually wanted an after_stat() solution first, before providing this answer. But I got distracted by the after_stat() keyword and got too excited by the prospect of someone else taking interest in the thing that I’m obsessed with. Alas this wasn’t the case - they were trying to do something practical - so I went back into the thread to figure out their goal for my second attempt.\r\nAttempt 2: Hmm but why not after_scale()?\r\nWhat I had neglected in my first attempt was the fact that the user talked more about their problem with someone else who got to the question before I did. That discussion turned out to include an important clue to the intent behind the original question: the user wanted the number of bars in order to interpolate the color of the bars.\r\nSo for example, a palette function like topo.colors() takes n to produce interpolated color values:\r\n\r\n\r\ntopo.colors(n = 16)\r\n\r\n [1] \"#4C00FF\" \"#0F00FF\" \"#002EFF\" \"#006BFF\" \"#00A8FF\" \"#00E5FF\" \"#00FF4D\"\r\n [8] \"#00FF00\" \"#4DFF00\" \"#99FF00\" \"#E6FF00\" \"#FFFF00\" \"#FFEA2D\" \"#FFDE59\"\r\n [15] \"#FFDB86\" \"#FFE0B3\"\r\n\r\nchroma::show_col(topo.colors(16))\r\n\r\n\r\n\r\nIf the intent is to use the number of bars to generate a vector of colors to assign to the bars, then a better place to do it would be in the after_scale(), where the state of the layer data in the after-scale looks like this:\r\n\r\n\r\nlayer_after_scale(p)\r\n\r\n # A tibble: 15 × 16\r\n fill y count prop x flipped_aes PANEL group ymin ymax xmin xmax \r\n \r\n 1 #F87… 18 18 1 1 FALSE 1 1 0 18 0.55 1.45\r\n 2 #E58… 19 19 1 2 FALSE 1 2 0 19 1.55 2.45\r\n 3 #C99… 37 37 1 3 FALSE 1 3 0 37 2.55 3.45\r\n 4 #A3A… 25 25 1 4 FALSE 1 4 0 25 3.55 4.45\r\n 5 #6BB… 9 9 1 5 FALSE 1 5 0 9 4.55 5.45\r\n 6 #00B… 14 14 1 6 FALSE 1 6 0 14 5.55 6.45\r\n 7 #00B… 8 8 1 7 FALSE 1 7 0 8 6.55 7.45\r\n 8 #00C… 4 4 1 8 FALSE 1 8 0 4 7.55 8.45\r\n 9 #00B… 3 3 1 9 FALSE 1 9 0 3 8.55 9.45\r\n 10 #00B… 4 4 1 10 FALSE 1 10 0 4 9.55 10.45\r\n 11 #619… 13 13 1 11 FALSE 1 11 0 13 10.55 11.45\r\n 12 #B98… 5 5 1 12 FALSE 1 12 0 5 11.55 12.45\r\n 13 #E76… 14 14 1 13 FALSE 1 13 0 14 12.55 13.45\r\n 14 #FD6… 34 34 1 14 FALSE 1 14 0 34 13.55 14.45\r\n 15 #FF6… 27 27 1 15 FALSE 1 15 0 27 14.55 15.45\r\n # ℹ 4 more variables: colour , linewidth , linetype ,\r\n # alpha \r\n\r\nIt’s still a tidy data where each row represents a bar. But the important distinction between the after-stat and the after-scale is that the after-scale data reflects the work of the (non-positional) scales. So the fill column here is now the actual hexadecimal color values for the bars:\r\n\r\n\r\nlayer_after_scale(p)$fill\r\n\r\n [1] \"#F8766D\" \"#E58700\" \"#C99800\" \"#A3A500\" \"#6BB100\" \"#00BA38\" \"#00BF7D\"\r\n [8] \"#00C0AF\" \"#00BCD8\" \"#00B0F6\" \"#619CFF\" \"#B983FF\" \"#E76BF3\" \"#FD61D1\"\r\n [15] \"#FF67A4\"\r\n\r\nchroma::show_col(layer_after_scale(p)$fill)\r\n\r\n\r\n\r\nWhat after_scale()/stage(after_scale = ) allows you to do is override these color values right before the layer data is sent off to be drawn. So we again use the same expression length(group) to grab the number of bars in the after-scale data, pass that value to a color palette function like topo.colors(), and re-map to the fill aesthetic.\r\n\r\n\r\nggplot(mpg, aes(manufacturer)) +\r\n geom_bar(aes(fill = stage(manufacturer, after_scale = topo.colors(length(group))))) +\r\n scale_fill_identity()\r\n\r\n\r\n\r\nSo this solution achieves the desired effect, but it’s needlessly complicated. You need complex staging of the fill aesthetic via stage() and you also need to pair this with scale_fill_identity() to let ggplot know that you’re directly supplying the fill values (otherwise you get errors and warnings).\r\nWait hold up - a fill scale? Did this user actually just want a custom fill scale? Ohhh…\r\nAttempt 3: Oh. You just wanted a scale_fill_*()…\r\nSo yeah. It turns out that they just wanted a custom scale that takes some set of colors and interpolate the colors across the bars in the plot.\r\nThe correct way to approach this problem is to create a new fill scale that wraps around discrete_scale(). The scale function should take a set of colors (cols) and pass discrete_scale() a palette function created via the function factory colorRampPalette().\r\n\r\n\r\nscale_fill_interpolate <- function(cols, ...) {\r\n discrete_scale(\r\n aesthetics = \"fill\",\r\n scale_name = \"interpolate\",\r\n palette = colorRampPalette(cols),\r\n ...\r\n )\r\n}\r\n\r\n\r\nOur new scale_fill_interpolate() function can now be added to the plot like any other scale:\r\n\r\n\r\np +\r\n scale_fill_interpolate(c(\"pink\", \"goldenrod\"))\r\n\r\n\r\n\r\n\r\n\r\np +\r\n scale_fill_interpolate(c(\"steelblue\", \"orange\", \"forestgreen\"))\r\n\r\n\r\n\r\n\r\n\r\nset.seed(123)\r\ncols <- sample(colors(), 5)\r\ncols\r\n\r\n [1] \"lightgoldenrodyellow\" \"mediumorchid1\" \"gray26\" \r\n [4] \"palevioletred2\" \"gray42\"\r\n\r\np +\r\n scale_fill_interpolate(cols)\r\n\r\n\r\n\r\nI sent (a variant of) this answer to the thread and the user marked it solved with a thanks, concluding my desperate spiral into finding the right solution to the intended question.\r\nReflections\r\nSo why was this so hard for me to get? The most immediate cause is because I quickly skimmed the wording of the question and extracted two key phrases:\r\n“access the number of bars in a barplot”\r\n“that can be used within ggplot, like after_stat()”\r\nBut neither of these turned out to be important (or even relevant) to the solution. The correct answer was just a clean custom fill scale, where you don’t have to think about the number of bars or accessing that in the internals. Simply extending discrete_scale() allows you to abstract away from those details entirely.\r\nSo in fairness, it was a very difficult XY problem to get out of. But the wording of the question wasn’t the root cause. I think the root cause is some combination of the following:\r\nThere are many ways to do the same thing in R so I automatically assume that my solution counts as a contribution as long as it gets the job done. But solutions should also be understandable for the person asking the question. Looking back, I was insane to even suggest my second attempt as the solution because it’s so contrived and borderline incomprehensible. It only sets the user up for more confusion and bugs in the future, so that was a bit irresponsible and selfish of me (it only scratches my itch).\r\nSolutions to (practical) problems are usually boring and I’m allergic to boring solutions. This is a bad attitude to have when offering to help people. I assumed that people share my excitement about ggplot internals, but actually most users don’t care (that’s why it’s called the internals and hidden from users). An important context that I miss as the person answering questions on the other end is that users post questions when they’re stuck and frustrated. Their goal is not to take a hard problem and turn it into a thinking exercise or a learning experience (that part happens organically, but is not the goal). If anything, that’s what I’m doing when I choose to take interest in other people’s (coding) problems.\r\nI imbue intent to questions that are clearing missing it. I don’t think that’s a categorically bad thing because it can sometimes land you in a shortcut out of an XY problem. But when you miss, it’s catastrophic and pulls you deeper into the problem. I think that was the case for me here - I conflated the X with the Y and assumed that after_stat() was relevant on face value because I personally know it to be a very powerful tool. I let my own history of treating after_stat() like the X (“How can I use after_stat() to solve/simplify this problem?”) guide my interpretation of the question, which is not good practice.\r\nOf course, there are likely more to this, but these are plenty for me to work on for now.\r\nLastly, I don’t want this to detract from the fact that the onus is on users to ask good questions. I don’t want to put question-answer-ers on the spot for their handling of XY problems. After all, most are volunteers who gain nothing from helping others besides status and some internet points.7 Just take this as me telling myself to be a better person.\r\nEnding on a fun aside - accidentally escaping an XY problem\r\nIt’s not my style to write serious blog posts. I think I deserve a break from many paragraphs of self-induced beat down.\r\nSo in that spirit I want to end on a funny anecdote where I escaped an XY problem by pure luck.\r\nI came across a relatively straightforward question which can be summarized as the following:\r\n\r\n\r\ninput <- \"a + c + d + e\"\r\noutput <- c(\"a\", \"c\", \"d\", \"e\")\r\n\r\n\r\nThere are many valid approaches to this and some were already posted to the thread:\r\n\r\n\r\nstrsplit(input, \" + \", TRUE)[[1]]\r\n\r\n [1] \"a\" \"c\" \"d\" \"e\"\r\n\r\nall.vars(parse(text = input))\r\n\r\n [1] \"a\" \"c\" \"d\" \"e\"\r\n\r\nMe, knowing too many useless things (and knowing that the the user already has the best answers), suggested a quirky alternative:8\r\n\r\nThis is super off-label usage but you can also use R’s formula utilities to parse this:9\r\n\r\n\r\n\r\nattr(terms(reformulate(input)), \"term.labels\")\r\n\r\n [1] \"a\" \"c\" \"d\" \"e\"\r\n\r\nTo my surprise, the response I got was:\r\n\r\nLovely! These definitely originated from formula ages ago so it’s actually not far off-label at all 🙂\r\n\r\n\r\nEspecially before slack deletes the old messages.↩︎\r\nIn R, you can use tools::file_ext() or fs::path_ext().↩︎\r\nhttps://en.wikipedia.org/wiki/Copypasta↩︎\r\nGood luck trying to explain the actual error message. Especially closure, a kind of weird vocabulary in R (fun fact - the first edition of Advanced R used to have a section on closure which is absent in the second edition probably because “In R, almost every function is a closure”).↩︎\r\nParadoxically, XY problems sometimes arise when inexperienced users try to come up with a reprex. They might capture the error/problem too narrowly, such that the more important broader context is left out.↩︎\r\nOr the number of distinct combinations between PANEL and group, as in nlevels(interaction(PANEL, group, drop = TRUE)). But of course that’s overkill and only of interest for “theoretical purity”.↩︎\r\nAnd I like the R4DS slack because it doesn’t have “internet points.” There is status (moderator) though I don’t wear the badge (literally - it’s an emoji).↩︎\r\nActually I only thought of this because I’d been writing a statistical package that required some nasty metaprogramming with the formula object.↩︎\r\nThe significance of this solution building on top of R’s formula utilities is that it will also parse stuff like \"a*b\" as c(\"a\", \"b\", \"a:b\"). So given that the inputs originated as R formulas (as the user later clarifies), this is the principled approach.↩︎\r\n",
"preview": "posts/2023-07-09-x-y-problem/preview.png",
- "last_modified": "2023-07-10T01:24:43-07:00",
+ "last_modified": "2023-07-10T04:24:43-04:00",
"input_file": {},
"preview_width": 238,
"preview_height": 205
@@ -81,7 +81,7 @@
],
"contents": "\r\n\r\nContents\r\nIntro\r\nSpecial properties of dplyr::slice()\r\nBasic usage\r\nRe-imagining slice() with data-masking\r\nSpecial properties of slice()\r\n\r\nA gallery of row operations with slice()\r\nRepeat rows (in place)\r\nSubset a selection of rows + the following row\r\nSubset a selection of rows + multiple following rows\r\nFilter (and encode) neighboring rows\r\nWindowed min/max/median (etc.)\r\nEvenly distributed row shuffling of balanced categories\r\nInserting a new row at specific intervals\r\nEvenly distributed row shuffling of unequal categories\r\n\r\nConclusion\r\n\r\nIntro\r\nIn data wrangling, there are a handful of classes of operations on data frames that we think of as theoretically well-defined and tackling distinct problems. To name a few, these include subsetting, joins, split-apply-combine, pairwise operations, nested-column workflows, and so on.\r\nAgainst this rich backdrop, there’s one aspect of data wrangling that doesn’t receive as much attention: ordering of rows. This isn’t necessarily surprising - we often think of row order as an auxiliary attribute of data frames since they don’t speak to the content of the data, per se. I think we all share the intuition that two dataframe that differ only in row order are practically the same for most analysis purposes.\r\nExcept when they aren’t.\r\nIn this blog post I want to talk about a few, somewhat esoteric cases of what I like to call row-relational operations. My goal is to try to motivate row-relational operations as a full-blown class of data wrangling operation that includes not only row ordering, but also sampling, shuffling, repeating, interweaving, and so on (I’ll go over all of these later).\r\nWithout spoiling too much, I believe that dplyr::slice() offers a powerful context for operations over row indices, even those that at first seem to lack a “tidy” solution. You may already know slice() as an indexing function, but my hope is to convince you that it can do so much more.\r\nLet’s start by first talking about some special properties of dplyr::slice(), and then see how we can use it for various row-relational operations.\r\nSpecial properties of dplyr::slice()\r\nBasic usage\r\nFor the following demonstration, I’ll use a small subset of the dplyr::starwars dataset:\r\n\r\n\r\nstarwars_sm <- dplyr::starwars[1:10, 1:3]\r\nstarwars_sm\r\n\r\n # A tibble: 10 × 3\r\n name height mass\r\n \r\n 1 Luke Skywalker 172 77\r\n 2 C-3PO 167 75\r\n 3 R2-D2 96 32\r\n 4 Darth Vader 202 136\r\n 5 Leia Organa 150 49\r\n 6 Owen Lars 178 120\r\n 7 Beru Whitesun lars 165 75\r\n 8 R5-D4 97 32\r\n 9 Biggs Darklighter 183 84\r\n 10 Obi-Wan Kenobi 182 77\r\n\r\n1) Row selection\r\nslice() is a row indexing verb - if you pass it a vector of integers, it subsets data frame rows:\r\n\r\n\r\nstarwars_sm |> \r\n slice(1:6) # First six rows\r\n\r\n # A tibble: 6 × 3\r\n name height mass\r\n \r\n 1 Luke Skywalker 172 77\r\n 2 C-3PO 167 75\r\n 3 R2-D2 96 32\r\n 4 Darth Vader 202 136\r\n 5 Leia Organa 150 49\r\n 6 Owen Lars 178 120\r\n\r\nLike other dplyr verbs with mutate-semantics, you can use context-dependent expressions inside slice(). For example, you can use n() to grab the last row (or last couple of rows):\r\n\r\n\r\nstarwars_sm |> \r\n slice( n() ) # Last row\r\n\r\n # A tibble: 1 × 3\r\n name height mass\r\n \r\n 1 Obi-Wan Kenobi 182 77\r\n\r\nstarwars_sm |> \r\n slice( n() - 2:0 ) # Last three rows\r\n\r\n # A tibble: 3 × 3\r\n name height mass\r\n \r\n 1 R5-D4 97 32\r\n 2 Biggs Darklighter 183 84\r\n 3 Obi-Wan Kenobi 182 77\r\n\r\nAnother context-dependent expression that comes in handy is row_number(), which returns all row indices. Using it inside slice() essentially performs an identity transformation:\r\n\r\n\r\nidentical(\r\n starwars_sm,\r\n starwars_sm |> slice( row_number() )\r\n)\r\n\r\n [1] TRUE\r\n\r\nLastly, similar to in select(), you can use - for negative indexing (to remove rows):\r\n\r\n\r\nidentical(\r\n starwars_sm |> slice(1:3), # First three rows\r\n starwars_sm |> slice(-(4:n())) # All rows except fourth row to last row\r\n)\r\n\r\n [1] TRUE\r\n\r\n2) Dynamic dots\r\nslice() supports dynamic dots. If you pass row indices into multiple argument positions, slice() will concatenate them for you:\r\n\r\n\r\nidentical(\r\n starwars_sm |> slice(1:6),\r\n starwars_sm |> slice(1, 2:4, 5, 6)\r\n)\r\n\r\n [1] TRUE\r\n\r\nIf you have a list() of row indices, you can use the splice operator !!! to spread them out:\r\n\r\n\r\nstarwars_sm |> \r\n slice( !!!list(1, 2:4, 5, 6) )\r\n\r\n # A tibble: 6 × 3\r\n name height mass\r\n \r\n 1 Luke Skywalker 172 77\r\n 2 C-3PO 167 75\r\n 3 R2-D2 96 32\r\n 4 Darth Vader 202 136\r\n 5 Leia Organa 150 49\r\n 6 Owen Lars 178 120\r\n\r\nThe above call to slice() evaluates to the following after splicing:\r\n\r\n\r\nrlang::expr( slice(!!!list(1, 2:4, 5, 6)) )\r\n\r\n slice(1, 2:4, 5, 6)\r\n\r\n3) Row ordering\r\nslice() respects the order in which you supplied the row indices:\r\n\r\n\r\nstarwars_sm |> \r\n slice(3, 1, 2, 5)\r\n\r\n # A tibble: 4 × 3\r\n name height mass\r\n \r\n 1 R2-D2 96 32\r\n 2 Luke Skywalker 172 77\r\n 3 C-3PO 167 75\r\n 4 Leia Organa 150 49\r\n\r\nThis means you can do stuff like random sampling with sample():\r\n\r\n\r\nstarwars_sm |> \r\n slice( sample(n()) )\r\n\r\n # A tibble: 10 × 3\r\n name height mass\r\n \r\n 1 Obi-Wan Kenobi 182 77\r\n 2 Owen Lars 178 120\r\n 3 Leia Organa 150 49\r\n 4 Darth Vader 202 136\r\n 5 Luke Skywalker 172 77\r\n 6 R5-D4 97 32\r\n 7 C-3PO 167 75\r\n 8 Beru Whitesun lars 165 75\r\n 9 Biggs Darklighter 183 84\r\n 10 R2-D2 96 32\r\n\r\nYou can also shuffle a subset of rows (ex: just the first five):\r\n\r\n\r\nstarwars_sm |> \r\n slice( sample(5), 6:n() )\r\n\r\n # A tibble: 10 × 3\r\n name height mass\r\n \r\n 1 C-3PO 167 75\r\n 2 Leia Organa 150 49\r\n 3 R2-D2 96 32\r\n 4 Darth Vader 202 136\r\n 5 Luke Skywalker 172 77\r\n 6 Owen Lars 178 120\r\n 7 Beru Whitesun lars 165 75\r\n 8 R5-D4 97 32\r\n 9 Biggs Darklighter 183 84\r\n 10 Obi-Wan Kenobi 182 77\r\n\r\nOr reorder all rows by their indices (ex: in reverse):\r\n\r\n\r\nstarwars_sm |> \r\n slice( rev(row_number()) )\r\n\r\n # A tibble: 10 × 3\r\n name height mass\r\n \r\n 1 Obi-Wan Kenobi 182 77\r\n 2 Biggs Darklighter 183 84\r\n 3 R5-D4 97 32\r\n 4 Beru Whitesun lars 165 75\r\n 5 Owen Lars 178 120\r\n 6 Leia Organa 150 49\r\n 7 Darth Vader 202 136\r\n 8 R2-D2 96 32\r\n 9 C-3PO 167 75\r\n 10 Luke Skywalker 172 77\r\n\r\n4) Out-of-bounds handling\r\nIf you pass a row index that’s out of bounds, slice() returns a 0-row data frame:\r\n\r\n\r\nstarwars_sm |> \r\n slice( n() + 1 ) # Select the row after the last row\r\n\r\n # A tibble: 0 × 3\r\n # ℹ 3 variables: name , height , mass \r\n\r\nWhen mixed with valid row indices, out-of-bounds indices are simply ignored (much 💜 for this behavior):\r\n\r\n\r\nstarwars_sm |> \r\n slice(\r\n 0, # 0th row - ignored\r\n 1:3, # first three rows\r\n n() + 1 # 1 after last row - ignored\r\n )\r\n\r\n # A tibble: 3 × 3\r\n name height mass\r\n \r\n 1 Luke Skywalker 172 77\r\n 2 C-3PO 167 75\r\n 3 R2-D2 96 32\r\n\r\nThis lets you do funky stuff like select all even numbered rows by passing slice() all row indices times 2:\r\n\r\n\r\nstarwars_sm |> \r\n slice( row_number() * 2 ) # Add `- 1` at the end for *odd* rows!\r\n\r\n # A tibble: 5 × 3\r\n name height mass\r\n \r\n 1 C-3PO 167 75\r\n 2 Darth Vader 202 136\r\n 3 Owen Lars 178 120\r\n 4 R5-D4 97 32\r\n 5 Obi-Wan Kenobi 182 77\r\n\r\nRe-imagining slice() with data-masking\r\nslice() is already pretty neat as it is, but that’s just the tip of the iceberg.\r\nThe really cool, under-rated feature of slice() is that it’s data-masked, meaning that you can reference column vectors as if they’re variables. Another way of describing this property of slice() is to say that it has mutate-semantics.\r\nAt a very basic level, this means that slice() can straightforwardly replicate the behavior of some dplyr verbs like arrange() and filter()!\r\nslice() as arrange()\r\nFrom our starwars_sm data, if we want to sort by height we can use arrange():\r\n\r\n\r\nstarwars_sm |> \r\n arrange(height)\r\n\r\n # A tibble: 10 × 3\r\n name height mass\r\n \r\n 1 R2-D2 96 32\r\n 2 R5-D4 97 32\r\n 3 Leia Organa 150 49\r\n 4 Beru Whitesun lars 165 75\r\n 5 C-3PO 167 75\r\n 6 Luke Skywalker 172 77\r\n 7 Owen Lars 178 120\r\n 8 Obi-Wan Kenobi 182 77\r\n 9 Biggs Darklighter 183 84\r\n 10 Darth Vader 202 136\r\n\r\nBut we can also do this with slice() to the same effect, using order():\r\n\r\n\r\nstarwars_sm |> \r\n slice( order(height) )\r\n\r\n # A tibble: 10 × 3\r\n name height mass\r\n \r\n 1 R2-D2 96 32\r\n 2 R5-D4 97 32\r\n 3 Leia Organa 150 49\r\n 4 Beru Whitesun lars 165 75\r\n 5 C-3PO 167 75\r\n 6 Luke Skywalker 172 77\r\n 7 Owen Lars 178 120\r\n 8 Obi-Wan Kenobi 182 77\r\n 9 Biggs Darklighter 183 84\r\n 10 Darth Vader 202 136\r\n\r\nThis is conceptually equivalent to combining the following 2-step process:\r\n\r\n\r\nordered_val_ind <- order(starwars_sm$height)\r\n ordered_val_ind\r\n\r\n [1] 3 8 5 7 2 1 6 10 9 4\r\n\r\n\r\n\r\nstarwars_sm |> \r\n slice( ordered_val_ind )\r\n\r\n # A tibble: 10 × 3\r\n name height mass\r\n \r\n 1 R2-D2 96 32\r\n 2 R5-D4 97 32\r\n 3 Leia Organa 150 49\r\n 4 Beru Whitesun lars 165 75\r\n 5 C-3PO 167 75\r\n 6 Luke Skywalker 172 77\r\n 7 Owen Lars 178 120\r\n 8 Obi-Wan Kenobi 182 77\r\n 9 Biggs Darklighter 183 84\r\n 10 Darth Vader 202 136\r\n\r\nslice() as filter()\r\nWe can also use slice() to filter(), using which():\r\n\r\n\r\nidentical(\r\n starwars_sm |> filter( height > 150 ),\r\n starwars_sm |> slice( which(height > 150) )\r\n)\r\n\r\n [1] TRUE\r\n\r\nThus, we can think of filter() and slice() as two sides of the same coin:\r\nfilter() takes a logical vector that’s the same length as the number of rows in the data frame\r\nslice() takes an integer vector that’s a (sub)set of a data frame’s row indices.\r\nTo put it more concretely, this logical vector was being passed to the above filter() call:\r\n\r\n\r\nstarwars_sm$height > 150\r\n\r\n [1] TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE\r\n\r\nWhile this integer vector was being passed to the above slice() call, where which() returns the position of TRUE values, given a logical vector:\r\n\r\n\r\nwhich( starwars_sm$height > 150 )\r\n\r\n [1] 1 2 4 6 7 9 10\r\n\r\nSpecial properties of slice()\r\nThis re-imagined slice() that heavily exploits data-masking gives us two interesting properties:\r\nWe can work with sets of row indices that need not to be the same length as the data frame (vs. filter()).\r\nWe can work with row indices as integers, which are legible to arithmetic operations (ex: + and *)\r\nTo grok the significance of working with rows as integer sets, let’s work through some examples where slice() comes in very handy.\r\nA gallery of row operations with slice()\r\nRepeat rows (in place)\r\nIn {tidyr}, there’s a function called uncount() which does the opposite of dplyr::count():\r\n\r\n\r\nlibrary(tidyr)\r\n# Example from `tidyr::uncount()` docs\r\nuncount_df <- tibble(x = c(\"a\", \"b\"), n = c(1, 2))\r\nuncount_df\r\n\r\n # A tibble: 2 × 2\r\n x n\r\n \r\n 1 a 1\r\n 2 b 2\r\n\r\nuncount_df |> \r\n uncount(n)\r\n\r\n # A tibble: 3 × 1\r\n x \r\n \r\n 1 a \r\n 2 b \r\n 3 b\r\n\r\nWe can mimic this behavior with slice(), using rep(times = ...):\r\n\r\n\r\nrep(1:nrow(uncount_df), times = uncount_df$n)\r\n\r\n [1] 1 2 2\r\n\r\nuncount_df |> \r\n slice( rep(row_number(), times = n) ) |> \r\n select( -n )\r\n\r\n # A tibble: 3 × 1\r\n x \r\n \r\n 1 a \r\n 2 b \r\n 3 b\r\n\r\nWhat if instead of a whole column storing that information, we only have information about row position?\r\nLet’s say we want to duplicate the rows of starwars_sm at the repeat_at positions:\r\n\r\n\r\nrepeat_at <- sample(5, 2)\r\nrepeat_at\r\n\r\n [1] 4 5\r\n\r\nIn slice(), you’d just select all rows plus those additional rows, then sort the integer row indices:\r\n\r\n\r\nstarwars_sm |> \r\n slice( sort(c(row_number(), repeat_at)) )\r\n\r\n # A tibble: 12 × 3\r\n name height mass\r\n \r\n 1 Luke Skywalker 172 77\r\n 2 C-3PO 167 75\r\n 3 R2-D2 96 32\r\n 4 Darth Vader 202 136\r\n 5 Darth Vader 202 136\r\n 6 Leia Organa 150 49\r\n 7 Leia Organa 150 49\r\n 8 Owen Lars 178 120\r\n 9 Beru Whitesun lars 165 75\r\n 10 R5-D4 97 32\r\n 11 Biggs Darklighter 183 84\r\n 12 Obi-Wan Kenobi 182 77\r\n\r\nWhat if we also separately have information about how much to repeat those rows by?\r\n\r\n\r\nrepeat_by <- c(3, 4)\r\n\r\n\r\nYou can apply the same rep() method for just the subset of rows to repeat:\r\n\r\n\r\nstarwars_sm |> \r\n slice( sort(c(row_number(), rep(repeat_at, times = repeat_by - 1))) )\r\n\r\n # A tibble: 15 × 3\r\n name height mass\r\n \r\n 1 Luke Skywalker 172 77\r\n 2 C-3PO 167 75\r\n 3 R2-D2 96 32\r\n 4 Darth Vader 202 136\r\n 5 Darth Vader 202 136\r\n 6 Darth Vader 202 136\r\n 7 Leia Organa 150 49\r\n 8 Leia Organa 150 49\r\n 9 Leia Organa 150 49\r\n 10 Leia Organa 150 49\r\n 11 Owen Lars 178 120\r\n 12 Beru Whitesun lars 165 75\r\n 13 R5-D4 97 32\r\n 14 Biggs Darklighter 183 84\r\n 15 Obi-Wan Kenobi 182 77\r\n\r\nCircling back to uncount(), you could also initialize a vector of 1s and replace() where the rows should be repeated:\r\n\r\n\r\nstarwars_sm |> \r\n uncount( replace(rep(1, n()), repeat_at, repeat_by) )\r\n\r\n # A tibble: 15 × 3\r\n name height mass\r\n \r\n 1 Luke Skywalker 172 77\r\n 2 C-3PO 167 75\r\n 3 R2-D2 96 32\r\n 4 Darth Vader 202 136\r\n 5 Darth Vader 202 136\r\n 6 Darth Vader 202 136\r\n 7 Leia Organa 150 49\r\n 8 Leia Organa 150 49\r\n 9 Leia Organa 150 49\r\n 10 Leia Organa 150 49\r\n 11 Owen Lars 178 120\r\n 12 Beru Whitesun lars 165 75\r\n 13 R5-D4 97 32\r\n 14 Biggs Darklighter 183 84\r\n 15 Obi-Wan Kenobi 182 77\r\n\r\nSubset a selection of rows + the following row\r\nRow order can sometimes encode a meaningful continuous measure, like time.\r\nTake for example this subset of the flights dataset in {nycflights13}:\r\n\r\n\r\nflights_df <- nycflights13::flights |> \r\n filter(month == 3, day == 3, origin == \"JFK\") |> \r\n select(dep_time, flight, carrier) |> \r\n slice(1:100) |> \r\n arrange(dep_time)\r\nflights_df\r\n\r\n # A tibble: 100 × 3\r\n dep_time flight carrier\r\n \r\n 1 535 1141 AA \r\n 2 551 5716 EV \r\n 3 555 145 B6 \r\n 4 556 208 B6 \r\n 5 556 79 B6 \r\n 6 601 501 B6 \r\n 7 604 725 B6 \r\n 8 606 135 B6 \r\n 9 606 600 UA \r\n 10 607 829 US \r\n # ℹ 90 more rows\r\n\r\nHere, the rows are ordered by dep_time, such that given a row, the next row is a data point for the next flight that departed from the airport.\r\nAnd let’s say we’re interested in flights that took off immediately after American Airlines (\"AA\") flights. Given what we just noted about the ordering of rows in the data frame, we can do this in slice() by adding 1 to the row index of AA flights:\r\n\r\n\r\nflights_df |> \r\n slice( which(carrier == \"AA\") + 1 )\r\n\r\n # A tibble: 14 × 3\r\n dep_time flight carrier\r\n \r\n 1 551 5716 EV \r\n 2 627 905 B6 \r\n 3 652 117 B6 \r\n 4 714 825 AA \r\n 5 717 987 B6 \r\n 6 724 11 VX \r\n 7 742 183 DL \r\n 8 802 655 AA \r\n 9 805 2143 DL \r\n 10 847 59 B6 \r\n 11 858 647 AA \r\n 12 859 120 DL \r\n 13 1031 179 AA \r\n 14 1036 641 B6\r\n\r\nWhat if we also want to keep observations for the preceding AA flights as well? We can just stick which(carrier == \"AA\") inside slice() too:\r\n\r\n\r\nflights_df |> \r\n slice(\r\n which(carrier == \"AA\"),\r\n which(carrier == \"AA\") + 1\r\n )\r\n\r\n # A tibble: 28 × 3\r\n dep_time flight carrier\r\n \r\n 1 535 1141 AA \r\n 2 626 413 AA \r\n 3 652 1815 AA \r\n 4 711 443 AA \r\n 5 714 825 AA \r\n 6 724 33 AA \r\n 7 739 59 AA \r\n 8 802 1838 AA \r\n 9 802 655 AA \r\n 10 843 1357 AA \r\n # ℹ 18 more rows\r\n\r\nBut now the rows are now ordered such that all the AA flights come before the other flights! How can we preserve the original order of increasing dep_time?\r\nWe could reconstruct the initial row order by piping the result into arrange(dep_time) again, but the simplest solution would be to concatenate the set of row indices and sort() them, since the output of which() is already integer!\r\n\r\n\r\nflights_df |> \r\n slice(\r\n sort(c(\r\n which(carrier == \"AA\"),\r\n which(carrier == \"AA\") + 1\r\n ))\r\n )\r\n\r\n # A tibble: 28 × 3\r\n dep_time flight carrier\r\n \r\n 1 535 1141 AA \r\n 2 551 5716 EV \r\n 3 626 413 AA \r\n 4 627 905 B6 \r\n 5 652 1815 AA \r\n 6 652 117 B6 \r\n 7 711 443 AA \r\n 8 714 825 AA \r\n 9 714 825 AA \r\n 10 717 987 B6 \r\n # ℹ 18 more rows\r\n\r\nNotice how the 8th and 9th rows are repeated here - that’s because 2 AA flights departed in a row (ha!). We can use unique() to remove duplicate rows in the same call to slice():\r\n\r\n\r\nflights_df |> \r\n slice(\r\n unique(sort(c(\r\n which(carrier == \"AA\"),\r\n which(carrier == \"AA\") + 1\r\n )))\r\n )\r\n\r\n # A tibble: 24 × 3\r\n dep_time flight carrier\r\n \r\n 1 535 1141 AA \r\n 2 551 5716 EV \r\n 3 626 413 AA \r\n 4 627 905 B6 \r\n 5 652 1815 AA \r\n 6 652 117 B6 \r\n 7 711 443 AA \r\n 8 714 825 AA \r\n 9 717 987 B6 \r\n 10 724 33 AA \r\n # ℹ 14 more rows\r\n\r\nImportantly, we can do all of this inside slice() because we’re working with integer sets. The integer part allows us to do things like + 1 and sort(), while the set part allows us to combine with c() and remove duplicates with unique().\r\nSubset a selection of rows + multiple following rows\r\nIn this example, let’s problematize our approach with the repeated which() calls in our previous solution.\r\nImagine another scenario where we want to filter for all AA flights and three subsequent flights for each.\r\nDo we need to write the solution out like this? That’s a lot of repetition!\r\n\r\n\r\nflights_df |> \r\n slice(\r\n which(carrier == \"AA\"),\r\n which(carrier == \"AA\") + 1,\r\n which(carrier == \"AA\") + 2,\r\n which(carrier == \"AA\") + 3\r\n )\r\n\r\n\r\nYou might think we can get away with + 0:3, but it doesn’t work as we’d like. The + just forces 0:3 to be (partially) recycled to the same length as carrier for element-wise addition:\r\n\r\n\r\nwhich(flights_df$carrier == \"AA\") + 0:3\r\n\r\n Warning in which(flights_df$carrier == \"AA\") + 0:3: longer object length is not\r\n a multiple of shorter object length\r\n [1] 1 14 20 27 25 28 34 40 38 62 66 68 91 93\r\n\r\nIf only we can get the outer sum of the two arrays, 0:3 and which(carrier == \"AA\") … Oh wait, we can - that’s what outer() does!\r\n\r\n\r\nouter(0:3, which(flights_df$carrier == \"AA\"), `+`)\r\n\r\n [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]\r\n [1,] 1 13 18 24 25 27 32 37 38 61 64 65 91 92\r\n [2,] 2 14 19 25 26 28 33 38 39 62 65 66 92 93\r\n [3,] 3 15 20 26 27 29 34 39 40 63 66 67 93 94\r\n [4,] 4 16 21 27 28 30 35 40 41 64 67 68 94 95\r\n\r\nThis is essentially the repeated which() vectors stacked on top of each other, but as a matrix:\r\n\r\n\r\nprint( which(flights_df$carrier == \"AA\") )\r\nprint( which(flights_df$carrier == \"AA\") + 1 )\r\nprint( which(flights_df$carrier == \"AA\") + 2 )\r\nprint( which(flights_df$carrier == \"AA\") + 3 )\r\n\r\n [1] 1 13 18 24 25 27 32 37 38 61 64 65 91 92\r\n [1] 2 14 19 25 26 28 33 38 39 62 65 66 92 93\r\n [1] 3 15 20 26 27 29 34 39 40 63 66 67 93 94\r\n [1] 4 16 21 27 28 30 35 40 41 64 67 68 94 95\r\n\r\nThe fact that outer() returns all the relevant row indices inside a single matrix is nice because we can collect the indices column-by-column, preserving row order. Matrices, like data frames, are column-major, so coercing a matrix to a vector collapses it column-wise:\r\n\r\n\r\nas.integer( outer(0:3, which(flights_df$carrier == \"AA\"), `+`) )\r\n\r\n [1] 1 2 3 4 13 14 15 16 18 19 20 21 24 25 26 27 25 26 27 28 27 28 29 30 32\r\n [26] 33 34 35 37 38 39 40 38 39 40 41 61 62 63 64 64 65 66 67 65 66 67 68 91 92\r\n [51] 93 94 92 93 94 95\r\n\r\n\r\nOther ways to coerce matrix to vector\r\nThere are two other options for coercing a matrix to vector - c() and as.vector(). I like to stick with as.integer() because that enforces integer type (which makes sense for row indices), and c() can be nice because it’s less to type (although it’s off-label usage):\r\n\r\n\r\n# Not run, but equivalent to `as.integer()` method\r\nas.vector( outer(0:3, which(flights_df$carrier == \"AA\"), `+`) )\r\nc( outer(0:3, which(flights_df$carrier == \"AA\"), `+`) )\r\n\r\n\r\nSomewhat relatedly - and this only works inside the tidy-eval context of slice() - you can get a similar effect of “collapsing” a matrix using the splice operator !!!:\r\n\r\n\r\nseq_matrix <- matrix(1:9, byrow = TRUE, nrow = 3)\r\nas.integer(seq_matrix)\r\n\r\n [1] 1 4 7 2 5 8 3 6 9\r\n\r\nidentical(\r\n mtcars |> slice( as.vector(seq_matrix) ),\r\n mtcars |> slice( !!!seq_matrix )\r\n)\r\n\r\n [1] TRUE\r\n\r\nHere, the !!!seq_matrix was slotting each individual “cell” as argument to slice():\r\n\r\n\r\nrlang::expr( slice(!!!seq_matrix) )\r\n\r\n slice(1L, 4L, 7L, 2L, 5L, 8L, 3L, 6L, 9L)\r\n\r\nA big difference in behavior between as.integer() vs. !!! is that the latter works for lists of indices too, by slotting each element of the list as an argument to slice():\r\n\r\n\r\nseq_list <- list(c(1, 4, 7, 2), c(5, 8, 3, 6, 9))\r\nrlang::expr( slice( !!!seq_list ) )\r\n\r\n slice(c(1, 4, 7, 2), c(5, 8, 3, 6, 9))\r\n\r\nHowever, as you may already know, as.integer() cannot flatten lists:\r\n\r\n\r\nas.integer(seq_list)\r\n\r\n Error in eval(expr, envir, enclos): 'list' object cannot be coerced to type 'integer'\r\n\r\nNote that as.vector() and c() leaves lists as is, which is another reason to prefer as.integer() for type-checking:\r\n\r\n\r\nidentical(seq_list, as.vector(seq_list))\r\nidentical(seq_list, c(seq_list))\r\n\r\n [1] TRUE\r\n [1] TRUE\r\n\r\nFinally, back in our !!!seq_matrix example, we could have applied asplit(MARGIN = 2) to chunk the splicing by matrix column, although the overall effect would be the same:\r\n\r\n\r\nrlang::expr(slice( !!!seq_matrix ))\r\n\r\n slice(1L, 4L, 7L, 2L, 5L, 8L, 3L, 6L, 9L)\r\n\r\nrlang::expr(slice( !!!asplit(seq_matrix, 2) ))\r\n\r\n slice(c(1L, 4L, 7L), c(2L, 5L, 8L), c(3L, 6L, 9L))\r\n\r\nThis lets us ask questions like: Which AA flights departed within 3 flights of another AA flight?\r\n\r\n\r\nflights_df |> \r\n slice( as.integer( outer(0:3, which(carrier == \"AA\"), `+`) ) ) |> \r\n filter( carrier == \"AA\", duplicated(flight) ) |> \r\n distinct(flight, carrier)\r\n\r\n # A tibble: 6 × 2\r\n flight carrier\r\n \r\n 1 825 AA \r\n 2 33 AA \r\n 3 655 AA \r\n 4 1 AA \r\n 5 647 AA \r\n 6 179 AA\r\n\r\n\r\nSlicing all the way down: Case 1\r\nWith the addition of the .by argument to slice() in dplyr v1.10, we can re-write the above code as three calls to slice() (+ a call to select()):\r\n\r\n\r\nflights_df |> \r\n slice( as.integer( outer(0:3, which(carrier == \"AA\"), `+`) ) ) |> \r\n slice( which(carrier == \"AA\" & duplicated(flight)) ) |> # filter()\r\n slice( 1, .by = c(flight, carrier) ) |> # distinct()\r\n select(flight, carrier)\r\n\r\n # A tibble: 6 × 2\r\n flight carrier\r\n \r\n 1 825 AA \r\n 2 33 AA \r\n 3 655 AA \r\n 4 1 AA \r\n 5 647 AA \r\n 6 179 AA\r\n\r\nThe next example will demonstrate another, perhaps more practical usecase for outer() in slice().\r\nFilter (and encode) neighboring rows\r\nLet’s use a subset of the {gapminder} data set for this one. Here, we have data for each European country’s GDP-per-capita by year, between 1992 to 2007:\r\n\r\n\r\ngapminder_df <- gapminder::gapminder |> \r\n left_join(gapminder::country_codes, by = \"country\") |> # `multiple = \"all\"`\r\n filter(year >= 1992, continent == \"Europe\") |> \r\n select(country, country_code = iso_alpha, year, gdpPercap)\r\ngapminder_df\r\n\r\n # A tibble: 120 × 4\r\n country country_code year gdpPercap\r\n \r\n 1 Albania ALB 1992 2497.\r\n 2 Albania ALB 1997 3193.\r\n 3 Albania ALB 2002 4604.\r\n 4 Albania ALB 2007 5937.\r\n 5 Austria AUT 1992 27042.\r\n 6 Austria AUT 1997 29096.\r\n 7 Austria AUT 2002 32418.\r\n 8 Austria AUT 2007 36126.\r\n 9 Belgium BEL 1992 25576.\r\n 10 Belgium BEL 1997 27561.\r\n # ℹ 110 more rows\r\n\r\nThis time, let’s see the desired output (plot) first and build our way up. The goal is to plot the GDP growth of Germany over the years, and its yearly GDP neighbors side-by-side:\r\n\r\n\r\n\r\nFirst, let’s think about what a “GDP neighbor” means in row-relational terms. If you arranged the data by GDP, the GDP neighbors would be the rows that come immediately before and after the rows for Germany. You need to recalculate neighbors every year though, so this arrange() + slice() combo should happen by-year.\r\nWith that in mind, let’s set up a year grouping and arrange by gdpPercap within year:1\r\n\r\n\r\ngapminder_df |> \r\n group_by(year) |> \r\n arrange(gdpPercap, .by_group = TRUE)\r\n\r\n # A tibble: 120 × 4\r\n # Groups: year [4]\r\n country country_code year gdpPercap\r\n \r\n 1 Albania ALB 1992 2497.\r\n 2 Bosnia and Herzegovina BIH 1992 2547.\r\n 3 Turkey TUR 1992 5678.\r\n 4 Bulgaria BGR 1992 6303.\r\n 5 Romania ROU 1992 6598.\r\n 6 Montenegro MNE 1992 7003.\r\n 7 Poland POL 1992 7739.\r\n 8 Croatia HRV 1992 8448.\r\n 9 Serbia SRB 1992 9325.\r\n 10 Slovak Republic SVK 1992 9498.\r\n # ℹ 110 more rows\r\n\r\nNow within each year, we want to grab the row for Germany and its neighboring rows. We can do this by taking the outer() sum of -1:1 and the row indices for Germany:\r\n\r\n\r\ngapminder_df |> \r\n group_by(year) |> \r\n arrange(gdpPercap, .by_group = TRUE) |> \r\n slice( as.integer(outer( -1:1, which(country == \"Germany\"), `+` )) )\r\n\r\n # A tibble: 12 × 4\r\n # Groups: year [4]\r\n country country_code year gdpPercap\r\n \r\n 1 Denmark DNK 1992 26407.\r\n 2 Germany DEU 1992 26505.\r\n 3 Netherlands NLD 1992 26791.\r\n 4 Belgium BEL 1997 27561.\r\n 5 Germany DEU 1997 27789.\r\n 6 Iceland ISL 1997 28061.\r\n 7 United Kingdom GBR 2002 29479.\r\n 8 Germany DEU 2002 30036.\r\n 9 Belgium BEL 2002 30486.\r\n 10 France FRA 2007 30470.\r\n 11 Germany DEU 2007 32170.\r\n 12 United Kingdom GBR 2007 33203.\r\n\r\n\r\nSlicing all the way down: Case 2\r\nThe new .by argument in slice() comes in handy again here, allowing us to collapse the group_by() + arrange() combo into one slice() call:\r\n\r\n\r\ngapminder_df |> \r\n slice( order(gdpPercap), .by = year) |> \r\n slice( as.integer(outer( -1:1, which(country == \"Germany\"), `+` )) )\r\n\r\n # A tibble: 12 × 4\r\n country country_code year gdpPercap\r\n \r\n 1 Denmark DNK 1992 26407.\r\n 2 Germany DEU 1992 26505.\r\n 3 Netherlands NLD 1992 26791.\r\n 4 Belgium BEL 1997 27561.\r\n 5 Germany DEU 1997 27789.\r\n 6 Iceland ISL 1997 28061.\r\n 7 United Kingdom GBR 2002 29479.\r\n 8 Germany DEU 2002 30036.\r\n 9 Belgium BEL 2002 30486.\r\n 10 France FRA 2007 30470.\r\n 11 Germany DEU 2007 32170.\r\n 12 United Kingdom GBR 2007 33203.\r\n\r\nFor our purposes here we want actually the grouping to persist for the following mutate() call, but there may be other cases where you’d want to use slice(.by = ) for temporary grouping.\r\nNow we’re already starting to see the shape of the data that we want! The last step is to encode the relationship of each row to Germany - does a row represent Germany itself, or a country that’s one GDP ranking below or above Germany?\r\nContinuing with our grouped context, we make a new column grp that assigns a factor value \"lo\"-\"is\"-\"hi\" (for “lower” than Germany, “is” Germany and “higher” than Germany) to each country trio by year. Notice the use of fct_inorder() below - this ensures that the factor levels are in the order of their occurrence (necessary for the correct ordering of bars in geom_col() later):\r\n\r\n\r\ngapminder_df |> \r\n group_by(year) |> \r\n arrange(gdpPercap) |> \r\n slice( as.integer(outer( -1:1, which(country == \"Germany\"), `+` )) ) |> \r\n mutate(grp = forcats::fct_inorder(c(\"lo\", \"is\", \"hi\")))\r\n\r\n # A tibble: 12 × 5\r\n # Groups: year [4]\r\n country country_code year gdpPercap grp \r\n \r\n 1 Denmark DNK 1992 26407. lo \r\n 2 Germany DEU 1992 26505. is \r\n 3 Netherlands NLD 1992 26791. hi \r\n 4 Belgium BEL 1997 27561. lo \r\n 5 Germany DEU 1997 27789. is \r\n 6 Iceland ISL 1997 28061. hi \r\n 7 United Kingdom GBR 2002 29479. lo \r\n 8 Germany DEU 2002 30036. is \r\n 9 Belgium BEL 2002 30486. hi \r\n 10 France FRA 2007 30470. lo \r\n 11 Germany DEU 2007 32170. is \r\n 12 United Kingdom GBR 2007 33203. hi\r\n\r\nWe now have everything that’s necessary to make our desired plot, so we ungroup(), write some {ggplot2} code, and voila!\r\n\r\n\r\ngapminder_df |> \r\n group_by(year) |> \r\n arrange(gdpPercap) |> \r\n slice( as.integer(outer( -1:1, which(country == \"Germany\"), `+` )) ) |> \r\n mutate(grp = forcats::fct_inorder(c(\"lo\", \"is\", \"hi\"))) |> \r\n # Ungroup and make ggplot\r\n ungroup() |> \r\n ggplot(aes(as.factor(year), gdpPercap, group = grp)) +\r\n geom_col(aes(fill = grp == \"is\"), position = position_dodge()) +\r\n geom_text(\r\n aes(label = country_code),\r\n vjust = 1.3,\r\n position = position_dodge(width = .9)\r\n ) +\r\n scale_fill_manual(\r\n values = c(\"grey75\", \"steelblue\"),\r\n guide = guide_none()\r\n ) +\r\n theme_classic() +\r\n labs(x = \"Year\", y = \"GDP per capita\")\r\n\r\n\r\n\r\n\r\nSolving the harder version of the problem\r\nThe solution presented above relies on a fragile assumption that Germany will always have a higher and lower ranking GDP neighbor every year. But nothing about the problem description guarantees this, so how can we re-write our code to be more robust?\r\nFirst, let’s simulate a data where Germany is the lowest ranking country in 2002 and the highest ranking in 2007. In other words, Germany only has one GDP neighbor in those years:\r\n\r\n\r\ngapminder_harder_df <- gapminder_df |> \r\n slice( order(gdpPercap), .by = year) |> \r\n slice( as.integer(outer( -1:1, which(country == \"Germany\"), `+` )) ) |> \r\n slice( -7, -12 )\r\ngapminder_harder_df\r\n\r\n # A tibble: 10 × 4\r\n country country_code year gdpPercap\r\n \r\n 1 Denmark DNK 1992 26407.\r\n 2 Germany DEU 1992 26505.\r\n 3 Netherlands NLD 1992 26791.\r\n 4 Belgium BEL 1997 27561.\r\n 5 Germany DEU 1997 27789.\r\n 6 Iceland ISL 1997 28061.\r\n 7 Germany DEU 2002 30036.\r\n 8 Belgium BEL 2002 30486.\r\n 9 France FRA 2007 30470.\r\n 10 Germany DEU 2007 32170.\r\n\r\nGiven this data, we cannot assign the full, length-3 lo-is-hi factor by group, because the groups for year 2002 and 2007 only have 2 observations:\r\n\r\n\r\ngapminder_harder_df |> \r\n group_by(year) |> \r\n mutate(grp = forcats::fct_inorder(c(\"lo\", \"is\", \"hi\")))\r\n\r\n Error in `mutate()`:\r\n ℹ In argument: `grp = forcats::fct_inorder(c(\"lo\", \"is\", \"hi\"))`.\r\n ℹ In group 3: `year = 2002`.\r\n Caused by error:\r\n ! `grp` must be size 2 or 1, not 3.\r\n\r\nThe trick here is to turn each group of rows into an integer sequence where Germany is “anchored” to 2, and then use that vector to subset the lo-is-hi factor:\r\n\r\n\r\ngapminder_harder_df |> \r\n group_by(year) |> \r\n mutate(\r\n Germany_anchored_to_2 = row_number() - which(country == \"Germany\") + 2,\r\n grp = forcats::fct_inorder(c(\"lo\", \"is\", \"hi\"))[Germany_anchored_to_2]\r\n )\r\n\r\n # A tibble: 10 × 6\r\n # Groups: year [4]\r\n country country_code year gdpPercap Germany_anchored_to_2 grp \r\n \r\n 1 Denmark DNK 1992 26407. 1 lo \r\n 2 Germany DEU 1992 26505. 2 is \r\n 3 Netherlands NLD 1992 26791. 3 hi \r\n 4 Belgium BEL 1997 27561. 1 lo \r\n 5 Germany DEU 1997 27789. 2 is \r\n 6 Iceland ISL 1997 28061. 3 hi \r\n 7 Germany DEU 2002 30036. 2 is \r\n 8 Belgium BEL 2002 30486. 3 hi \r\n 9 France FRA 2007 30470. 1 lo \r\n 10 Germany DEU 2007 32170. 2 is\r\n\r\nWe find that the lessons of working with row indices from slice() translated to solving this complex mutate() problem - neat!\r\nWindowed min/max/median (etc.)\r\nLet’s say we have this small time series data, and we want to calculate a lagged 3-window moving minimum for the val column:\r\n\r\n\r\nts_df <- tibble(\r\n time = 1:6,\r\n val = sample(1:6 * 10)\r\n)\r\nts_df\r\n\r\n # A tibble: 6 × 2\r\n time val\r\n \r\n 1 1 50\r\n 2 2 40\r\n 3 3 60\r\n 4 4 30\r\n 5 5 20\r\n 6 6 10\r\n\r\nIf you’re new to window functions, think of them as a special kind of group_by() + summarize() where groups are chunks of observations along a (typically unique) continuous measure like time, and observations can be shared between groups.\r\nThere are several packages implementing moving/sliding/rolling window functions. My current favorite is {r2c} (see a review of other implementations therein), but I also like {slider} for an implementation that follows familiar “tidy” design principles:\r\n\r\n\r\nlibrary(slider)\r\nts_df |> \r\n mutate(moving_min = slide_min(val, before = 2L, complete = TRUE))\r\n\r\n # A tibble: 6 × 3\r\n time val moving_min\r\n \r\n 1 1 50 NA\r\n 2 2 40 NA\r\n 3 3 60 40\r\n 4 4 30 30\r\n 5 5 20 20\r\n 6 6 10 10\r\n\r\nMoving window is a general class of operations that encompass any arbitrary summary statistic - so not just min but other reducing functions like mean, standard deviation, etc. But what makes moving min (along with max, median, etc.) a particularly interesting case for our current discussion is that the value comes from an existing observation in the data. And if our time series is tidy, every observation makes up a row. See where I’m going with this?\r\nUsing outer() again, we can take the outer sum of all row indices of ts_df and -2:0. This gives us a matrix where each column represents a lagged size-3 moving window:\r\n\r\n\r\nwindows_3lag <- outer(-2:0, 1:nrow(ts_df), \"+\")\r\nwindows_3lag\r\n\r\n [,1] [,2] [,3] [,4] [,5] [,6]\r\n [1,] -1 0 1 2 3 4\r\n [2,] 0 1 2 3 4 5\r\n [3,] 1 2 3 4 5 6\r\n\r\nThe “lagged size-3” property of this moving window means that the first two windows are incomplete (consisting of less than 3 observations). We want to treat those as invalid, so we can drop the first two columns from our matrix:\r\n\r\n\r\nwindows_3lag[,-(1:2)]\r\n\r\n [,1] [,2] [,3] [,4]\r\n [1,] 1 2 3 4\r\n [2,] 2 3 4 5\r\n [3,] 3 4 5 6\r\n\r\nFor each remaining column, we want to grab the values of val at the corresponding row indices and find which row has the minimum val. In terms of code, we use apply() with MARGIN = 2L to column-wise apply a function where we use which.min() to find the location of the minimum val and convert it back to row index via subsetting:\r\n\r\n\r\nwindows_3lag[, -(1:2)] |> \r\n apply(MARGIN = 2L, \\(i) i[which.min(ts_df$val[i])])\r\n\r\n [1] 2 4 5 6\r\n\r\nNow let’s stick this inside slice(), exploiting the fact that it’s data-masked (ts_df$val can just be val) and exposes context-dependent expressions (1:nrow(ts_df) can just be row_number()):\r\n\r\n\r\nmoving_mins <- ts_df |> \r\n slice(\r\n outer(-2:0, row_number(), \"+\")[,-(1:2)] |> \r\n apply(MARGIN = 2L, \\(i) i[which.min(val[i])])\r\n )\r\nmoving_mins\r\n\r\n # A tibble: 4 × 2\r\n time val\r\n \r\n 1 2 40\r\n 2 4 30\r\n 3 5 20\r\n 4 6 10\r\n\r\nFrom here, we can grab the val column and pad it with NA to add our desired window_min column to the original data frame:\r\n\r\n\r\nts_df |> \r\n mutate(moving_min = c(NA, NA, moving_mins$val))\r\n\r\n # A tibble: 6 × 3\r\n time val moving_min\r\n \r\n 1 1 50 NA\r\n 2 2 40 NA\r\n 3 3 60 40\r\n 4 4 30 30\r\n 5 5 20 20\r\n 6 6 10 10\r\n\r\nAt this point you might think that this is a very round-about way of solving the same problem. But actually I think that it’s a faster route to solving a slightly more complicated problem - augmenting each observation of a data frame with information about comparison observations.\r\nFor example, our slice()-based solution sets us up nicely for also bringing along information about the time at which the moving_min occurred. After some rename()-ing and adding the original time information back in, we get back a relational data structure where time is a key shared with ts_df:\r\n\r\n\r\nmoving_mins2 <- moving_mins |> \r\n rename(moving_min_val = val, moving_min_time = time) |> \r\n mutate(time = ts_df$time[-(1:2)], .before = 1L)\r\nmoving_mins2\r\n\r\n # A tibble: 4 × 3\r\n time moving_min_time moving_min_val\r\n \r\n 1 3 2 40\r\n 2 4 4 30\r\n 3 5 5 20\r\n 4 6 6 10\r\n\r\nWe can then left-join this to the original data to augment it with information about both the value of the 3-window minimum and the time that the minimum occurred:\r\n\r\n\r\nleft_join(ts_df, moving_mins2, by = \"time\")\r\n\r\n # A tibble: 6 × 4\r\n time val moving_min_time moving_min_val\r\n \r\n 1 1 50 NA NA\r\n 2 2 40 NA NA\r\n 3 3 60 2 40\r\n 4 4 30 4 30\r\n 5 5 20 5 20\r\n 6 6 10 6 10\r\n\r\nThis is particularly useful if rows contain other useful information for comparison and you have memory to spare:\r\n\r\n\r\nts_wide_df <- ts_df |> \r\n mutate(\r\n col1 = rnorm(6),\r\n col2 = rnorm(6)\r\n )\r\nts_wide_df\r\n\r\n # A tibble: 6 × 4\r\n time val col1 col2\r\n \r\n 1 1 50 0.0183 0.00501\r\n 2 2 40 0.705 -0.0376 \r\n 3 3 60 -0.647 0.724 \r\n 4 4 30 0.868 -0.497 \r\n 5 5 20 0.376 0.0114 \r\n 6 6 10 0.310 0.00986\r\n\r\nThe below code augments each observation in the original ts_wide_df data with information about the corresponding 3-window moving min (columns prefixed with \"min3val_\")\r\n\r\n\r\nmoving_mins_wide <- ts_wide_df |> \r\n slice(\r\n outer(-2:0, row_number(), \"+\")[,-(1:2)] |> \r\n apply(MARGIN = 2L, \\(i) i[which.min(val[i])])\r\n ) |> \r\n rename_with(~ paste0(\"min3val_\", .x)) |> \r\n mutate(time = ts_wide_df$time[-(1:2)])\r\nleft_join(ts_wide_df, moving_mins_wide, by = \"time\")\r\n\r\n # A tibble: 6 × 8\r\n time val col1 col2 min3val_time min3val_val min3val_col1\r\n \r\n 1 1 50 0.0183 0.00501 NA NA NA \r\n 2 2 40 0.705 -0.0376 NA NA NA \r\n 3 3 60 -0.647 0.724 2 40 0.705\r\n 4 4 30 0.868 -0.497 4 30 0.868\r\n 5 5 20 0.376 0.0114 5 20 0.376\r\n 6 6 10 0.310 0.00986 6 10 0.310\r\n # ℹ 1 more variable: min3val_col2 \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nEvenly distributed row shuffling of balanced categories\r\nSometimes the ordering of rows in a data frame can be meaningful for an external application.\r\nFor example, many experiment-building platforms for psychology research require researchers to specify the running order of trials in an experiment via a csv, where each row represents a trial and each column represents information about the trial.\r\nSo an experiment testing the classic Stroop effect may have the following template:\r\n\r\n\r\nmismatch_trials <- tibble(\r\n item_id = 1:5,\r\n trial = \"mismatch\",\r\n word = c(\"red\", \"green\", \"purple\", \"brown\", \"blue\"),\r\n color = c(\"brown\", \"red\", \"green\", \"blue\", \"purple\")\r\n)\r\nmismatch_trials\r\n\r\n # A tibble: 5 × 4\r\n item_id trial word color \r\n \r\n 1 1 mismatch red brown \r\n 2 2 mismatch green red \r\n 3 3 mismatch purple green \r\n 4 4 mismatch brown blue \r\n 5 5 mismatch blue purple\r\n\r\nWe probably also want to mix in some control trials where the word and color do match:\r\n\r\n\r\nmatch_trials <- mismatch_trials |> \r\n mutate(trial = \"match\", color = word)\r\nmatch_trials\r\n\r\n # A tibble: 5 × 4\r\n item_id trial word color \r\n \r\n 1 1 match red red \r\n 2 2 match green green \r\n 3 3 match purple purple\r\n 4 4 match brown brown \r\n 5 5 match blue blue\r\n\r\nNow that we have all materials for our experiment, we next want the running order to interleave the match and mismatch trials.\r\nWe first add them together into a longer data frame:\r\n\r\n\r\nstroop_trials <- bind_rows(mismatch_trials, match_trials)\r\nstroop_trials\r\n\r\n # A tibble: 10 × 4\r\n item_id trial word color \r\n \r\n 1 1 mismatch red brown \r\n 2 2 mismatch green red \r\n 3 3 mismatch purple green \r\n 4 4 mismatch brown blue \r\n 5 5 mismatch blue purple\r\n 6 1 match red red \r\n 7 2 match green green \r\n 8 3 match purple purple\r\n 9 4 match brown brown \r\n 10 5 match blue blue\r\n\r\nAnd from here we can exploit the fact that all mismatch items come before match items, and that they share the same length of 5:\r\n\r\n\r\nstroop_trials |> \r\n slice( as.integer(outer(c(0, 5), 1:5, \"+\")) )\r\n\r\n # A tibble: 10 × 4\r\n item_id trial word color \r\n \r\n 1 1 mismatch red brown \r\n 2 1 match red red \r\n 3 2 mismatch green red \r\n 4 2 match green green \r\n 5 3 mismatch purple green \r\n 6 3 match purple purple\r\n 7 4 mismatch brown blue \r\n 8 4 match brown brown \r\n 9 5 mismatch blue purple\r\n 10 5 match blue blue\r\n\r\nThis relies on a strong assumptions about the row order in the original data, though. So a safer alternative is to represent the row indices for \"match\" and \"mismatch\" trials as rows of a matrix, and then collapse column-wise.\r\nLet’s try this outside of slice() first. We start with a call to sapply() to construct a matrix where the columns contain row indices for each unique category of trial:\r\n\r\n\r\nsapply(unique(stroop_trials$trial), \\(x) which(stroop_trials$trial == x))\r\n\r\n mismatch match\r\n [1,] 1 6\r\n [2,] 2 7\r\n [3,] 3 8\r\n [4,] 4 9\r\n [5,] 5 10\r\n\r\nThen we transpose the matrix with t(), which rotates it:\r\n\r\n\r\nt( sapply(unique(stroop_trials$trial), \\(x) which(stroop_trials$trial == x)) )\r\n\r\n [,1] [,2] [,3] [,4] [,5]\r\n mismatch 1 2 3 4 5\r\n match 6 7 8 9 10\r\n\r\nNow lets stick that inside slice, remembering to collapse the transposed matrix into vector:\r\n\r\n\r\ninterleaved_stroop_trials <- stroop_trials |> \r\n slice( as.integer(t(sapply(unique(trial), \\(x) which(trial == x)))) )\r\ninterleaved_stroop_trials\r\n\r\n # A tibble: 10 × 4\r\n item_id trial word color \r\n \r\n 1 1 mismatch red brown \r\n 2 1 match red red \r\n 3 2 mismatch green red \r\n 4 2 match green green \r\n 5 3 mismatch purple green \r\n 6 3 match purple purple\r\n 7 4 mismatch brown blue \r\n 8 4 match brown brown \r\n 9 5 mismatch blue purple\r\n 10 5 match blue blue\r\n\r\nAt the moment, we have both “red” word trails showing up together, and then the “green”s, the “purple”s, and so on. If we wanted to introduce some randomness to the presentation order within each type of trial, we can wrap the row indices in sample() to shuffle them first:\r\n\r\n\r\nshuffled_stroop_trials <- stroop_trials |> \r\n slice( as.integer(t(sapply(unique(trial), \\(x) sample(which(trial == x))))) )\r\nshuffled_stroop_trials\r\n\r\n # A tibble: 10 × 4\r\n item_id trial word color \r\n \r\n 1 1 mismatch red brown \r\n 2 5 match blue blue \r\n 3 2 mismatch green red \r\n 4 4 match brown brown \r\n 5 3 mismatch purple green \r\n 6 1 match red red \r\n 7 4 mismatch brown blue \r\n 8 3 match purple purple\r\n 9 5 mismatch blue purple\r\n 10 2 match green green\r\n\r\n\r\nInserting a new row at specific intervals\r\nContinuing with our Stroop experiment template example, let’s say we want to give participants a break every two trials.\r\nIn a matrix representation, this means constructing this 2-row matrix of row indices:\r\n\r\n\r\nmatrix(1:nrow(shuffled_stroop_trials), nrow = 2)\r\n\r\n [,1] [,2] [,3] [,4] [,5]\r\n [1,] 1 3 5 7 9\r\n [2,] 2 4 6 8 10\r\n\r\nAnd adding a row of that represent a separator/break, before collapsing column-wise:\r\n\r\n\r\nmatrix(1:nrow(shuffled_stroop_trials), nrow = 2) |> \r\n rbind(11)\r\n\r\n [,1] [,2] [,3] [,4] [,5]\r\n [1,] 1 3 5 7 9\r\n [2,] 2 4 6 8 10\r\n [3,] 11 11 11 11 11\r\n\r\nUsing slice, this means adding a row to the data representing a break trial first, and then adding a row to the row index matrix representing that row:\r\n\r\n\r\nstroop_with_breaks <- shuffled_stroop_trials |> \r\n add_row(trial = \"BREAK\") |> \r\n slice(\r\n matrix(row_number()[-n()], nrow = 2) |> \r\n rbind(n()) |> \r\n as.integer()\r\n )\r\nstroop_with_breaks\r\n\r\n # A tibble: 15 × 4\r\n item_id trial word color \r\n \r\n 1 1 mismatch red brown \r\n 2 5 match blue blue \r\n 3 NA BREAK \r\n 4 2 mismatch green red \r\n 5 4 match brown brown \r\n 6 NA BREAK \r\n 7 3 mismatch purple green \r\n 8 1 match red red \r\n 9 NA BREAK \r\n 10 4 mismatch brown blue \r\n 11 3 match purple purple\r\n 12 NA BREAK \r\n 13 5 mismatch blue purple\r\n 14 2 match green green \r\n 15 NA BREAK \r\n\r\nIf we don’t want a break after the last trial, we can use negative indexing with slice(-n()):\r\n\r\n\r\nstroop_with_breaks |> \r\n slice(-n())\r\n\r\n # A tibble: 14 × 4\r\n item_id trial word color \r\n \r\n 1 1 mismatch red brown \r\n 2 5 match blue blue \r\n 3 NA BREAK \r\n 4 2 mismatch green red \r\n 5 4 match brown brown \r\n 6 NA BREAK \r\n 7 3 mismatch purple green \r\n 8 1 match red red \r\n 9 NA BREAK \r\n 10 4 mismatch brown blue \r\n 11 3 match purple purple\r\n 12 NA BREAK \r\n 13 5 mismatch blue purple\r\n 14 2 match green green\r\n\r\nWhat about after 3 trials, where the number of trials (10) is not divisibly by 3? Can we still use a matrix?\r\nYes, you’d just need to explicitly fill in the “blanks”!\r\nConceptually, we want a matrix like this, where extra “cells” are padded with 0s (recall that 0s are ignored in slice()):\r\n\r\n\r\nmatrix(c(1:10, rep(0, 3 - 10 %% 3)), nrow = 3)\r\n\r\n [,1] [,2] [,3] [,4]\r\n [1,] 1 4 7 10\r\n [2,] 2 5 8 0\r\n [3,] 3 6 9 0\r\n\r\nAnd this is how that could be implemented inside slice(), minding the fact that adding the break trial increases original row count by 1:\r\n\r\n\r\nshuffled_stroop_trials |> \r\n add_row(trial = \"BREAK\") |> \r\n slice(\r\n c(seq_len(n()-1), rep(0, 3 - (n()-1) %% 3)) |> \r\n matrix(nrow = 3) |> \r\n rbind(n()) |> \r\n as.integer()\r\n ) |> \r\n slice(-n())\r\n\r\n # A tibble: 13 × 4\r\n item_id trial word color \r\n \r\n 1 1 mismatch red brown \r\n 2 5 match blue blue \r\n 3 2 mismatch green red \r\n 4 NA BREAK \r\n 5 4 match brown brown \r\n 6 3 mismatch purple green \r\n 7 1 match red red \r\n 8 NA BREAK \r\n 9 4 mismatch brown blue \r\n 10 3 match purple purple\r\n 11 5 mismatch blue purple\r\n 12 NA BREAK \r\n 13 2 match green green\r\n\r\nHow about inserting a break trial after every \"purple\" word trials?\r\nConceptually, we want a matrix that binds these two vectors as rows before collapsing:\r\n\r\n\r\nprint( 1:nrow(shuffled_stroop_trials) )\r\nprint(\r\n replace(rep(0, nrow(shuffled_stroop_trials)),\r\n which(shuffled_stroop_trials$word == \"purple\"), 11)\r\n)\r\n\r\n [1] 1 2 3 4 5 6 7 8 9 10\r\n [1] 0 0 0 0 11 0 0 11 0 0\r\n\r\nAnd this is how you could do that inside slice():\r\n\r\n\r\nshuffled_stroop_trials |> \r\n add_row(trial = \"BREAK\") |> \r\n slice(\r\n c(seq_len(n()-1), replace(rep(0, n()-1), which(word == \"purple\"), n())) |>\r\n matrix(nrow = 2, byrow = TRUE) |> \r\n as.integer()\r\n )\r\n\r\n # A tibble: 12 × 4\r\n item_id trial word color \r\n