From 6470e4d563055afcb2230473a5e0bb26f0206ce6 Mon Sep 17 00:00:00 2001 From: myushen Date: Thu, 18 Jul 2024 15:26:19 +1000 Subject: [PATCH 1/5] fix assay name for calculating CPM --- DESCRIPTION | 2 +- R/counts_per_million.R | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 34057d9..a2bd13e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: CuratedAtlasQueryR Title: Queries the Human Cell Atlas -Version: 1.3.7 +Version: 1.3.8 Authors@R: c( person( "Stefano", diff --git a/R/counts_per_million.R b/R/counts_per_million.R index db2a7de..6a88217 100644 --- a/R/counts_per_million.R +++ b/R/counts_per_million.R @@ -20,7 +20,8 @@ get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) { col_sums <- colSums(as.matrix(assay(data))) selected_cols <- which(col_sums >0 & col_sums < Inf) - sce <- SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[,selected_cols ,drop=FALSE ], assay.type = "X"))) + assay_name <- assays(data) |> names() + sce <- SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[,selected_cols ,drop=FALSE ], assay.type = assay_name))) rownames(sce) <- rownames(data[,selected_cols ]) colnames(sce) <- colnames(data[,selected_cols ]) From 83cba33f268197d402f8f7283d73cb08245fc03a Mon Sep 17 00:00:00 2001 From: myushen Date: Mon, 22 Jul 2024 11:48:31 +1000 Subject: [PATCH 2/5] generate and import pseudobulk in the import API --- DESCRIPTION | 6 ++--- NAMESPACE | 1 + R/import_metadata_and_counts.R | 41 +++++++++++++++++++++++++++++++-- data/sample_sce_obj.rda | Bin 8212 -> 8328 bytes man/import_one_sce.Rd | 9 +++++++- tests/testthat/test-query.R | 15 +++++++++--- 6 files changed, 63 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a2bd13e..022d2fd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: CuratedAtlasQueryR Title: Queries the Human Cell Atlas -Version: 1.3.8 +Version: 1.4.8 Authors@R: c( person( "Stefano", @@ -95,7 +95,8 @@ Imports: duckdb, stringr, checkmate, - stringfish + stringfish, + tidySingleCellExperiment Suggests: zellkonverter, rmarkdown, @@ -107,7 +108,6 @@ Suggests: spelling, forcats, ggplot2, - tidySingleCellExperiment, rprojroot, openssl, scuttle, diff --git a/NAMESPACE b/NAMESPACE index 940ac99..b0009d4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -83,6 +83,7 @@ importFrom(stringr,str_detect) importFrom(stringr,str_remove_all) importFrom(stringr,str_replace_all) importFrom(tibble,column_to_rownames) +importFrom(tidySingleCellExperiment,aggregate_cells) importFrom(tools,R_user_dir) importFrom(utils,head) importFrom(utils,packageName) diff --git a/R/import_metadata_and_counts.R b/R/import_metadata_and_counts.R index 04882ab..e85d846 100644 --- a/R/import_metadata_and_counts.R +++ b/R/import_metadata_and_counts.R @@ -5,6 +5,9 @@ #' @param cache_dir Optional character vector of length 1. A file path on #' your local system to a directory (not a file) that will be used to store #' `metadata.parquet` +#' @param pseudobulk Optional character. Set to TRUE for generating and importing pseudobulk, +#' the metadata slot of which must contain `file_id`, `cell_type_harmonised` and `sample_` +#' #' @export #' @return A metadata.parquet strip from the SingleCellExperiment object. #' Directories store counts and counts per million in the provided cache directory. @@ -16,6 +19,7 @@ #' @importFrom S4Vectors metadata metadata<- #' @importFrom SummarizedExperiment assay #' @importFrom stringr str_detect +#' @importFrom tidySingleCellExperiment aggregate_cells #' @examples #' data(sample_sce_obj) #' import_one_sce(sample_sce_obj, @@ -27,7 +31,8 @@ #' @source [Mangiola et al.,2023](https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3) import_one_sce <- function( sce_obj, - cache_dir = get_default_cache_dir() + cache_dir = get_default_cache_dir(), + pseudobulk = FALSE ) { original_dir <- file.path(cache_dir, "original") @@ -63,6 +68,19 @@ import_one_sce <- function( reducedDims(sce_obj) <- NULL } + # Pseudobulk checkpoint + pseudobulk_sample <- c("sample_", "cell_type_harmonised") + if (isTRUE(pseudobulk)) { + assert( + all(pseudobulk_sample %in% (colData(sce_obj) |> colnames()) ), + "Character in pseudobulk_sample must contain sample_ and cell_type_harmonised columns + in the SingleCellExperiment column metadata") + + assert(c(pseudobulk_sample, "file_id") %in% (names(metadata_tbl)) |> all() , + "SingleCellExperiment metadata must at least contain sample_, cell_type_harmonised, + file_id for pseudobulk generation" + ) } + # Create original and cpm folders in the cache directory if not exist if (!dir.exists(original_dir)) { cache_dir |> file.path("original") |> dir.create(recursive = TRUE) @@ -114,9 +132,28 @@ import_one_sce <- function( # check metadata sample file ID match the count file ID in cache directory all(metadata_tbl |> pull(.data$file_id_db) %in% dir(original_dir)) |> - assert("The filename for count assay, which matches the file_id_db column in the metadata, already exists in the cache directory.") + assert("The filename for count assay, which matches the file_id_db column in + the metadata, already exists in the cache directory.") # convert metadata_tbl to parquet if above checkpoints pass arrow::write_parquet(metadata_tbl, file.path(cache_dir, "metadata.parquet")) + + # generate pseudobulk + if (isTRUE(pseudobulk)) { + file_id <- metadata_tbl$file_id |> unique() |> as.character() + + cli_alert_info("Generating pseudobulk from {file_id}. ") + create_pseudobulk <- sce_obj |> aggregate_cells(c(sample_, cell_type_harmonised)) + + if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) { + cache_dir |> file.path("pseudobulk/original") |> dir.create(recursive = TRUE) + } + + pseudobulk_path <- file.path(cache_dir, "pseudobulk/original", basename(file_id)) + saveHDF5SummarizedExperiment(create_pseudobulk, pseudobulk_path ) + cli_alert_info("pseudobulk are generated in {.path {pseudobulk_path}}. ") + } + } + diff --git a/data/sample_sce_obj.rda b/data/sample_sce_obj.rda index e2f4f8f76b8632982d90014f7307a9838866c193..bcd5744eda0bab0f3297a503b10e9c9a0155639c 100644 GIT binary patch literal 8328 zcmV;3Aa~#WH+ooF0004LBHlIv03iV!0000G&sfah(rF+|T>vQ&2UKVgRpfkl* zy7}J8y%(LSVlrVEGvG*o*df4odNVN+Z_j~QUlPNy3TxNk=Q%hy#!g-vax-z*=3>kSaxM`i@G5y za={Il_MVDs!#I9!KTrWe>aSJcv(WZBu@5OGUSUP|MyYgddS+}|69{L2ZnX&q*dS95UcTx zFojT;@5P+)imLekN>dJjc(q{v+?tX`Jcsp4J$!I1>C~;7@XCjQ)h+y8%DJxEf%$`# z`I1v;Llx9f7k;rJ*18lUspgC><8+_gvglDSk#5m^v>Fd~X>GeFo2Ift;^GY%bVsh! z+MP`flhx;27UNRilgun`V;M@%3Sext3cq@f2lvEJYagi6|6~f3$KWoclk^2#);qP- z@AFbvuB2M*j9t*Ft1yt1_&+~tEvx`fC*^>Pv5gG)BeA?B@ zzNQNk~LDN*Nez^>JX+0@^eJd^~`~A&T-12SA9trs z2ZNVgiGe9<`}I+zG>|dckv#uZIeq%hLfdB$bv$EX-p*3W>?$@MY(RXjQyqC~nR=rT z;dY11lNYo3c#{Qe{&tDj=YmXQ%L@Sl6)77Qj;b z0NrwO^VyfXgfN7hX+`z~;vlOBxmarKQN|p8M`HwL$)+tf1}t0P`gt85=N$v25QLsR zbkiHwUi$%Ai|!u$p`YIeNZ-O(sRdsMGo}%l;JoR<(3Zo zF`ygtOC+qhCZd=;=d@wbr3o2jL$V3)fnw*rA@^Gz6zU{emYno8-B3d3liiJQ_DGf2 znE?&IWf1#@gQYXN5?r2SM&w`{p@)! z^?UcmurutW29RH70FeIFijMow7$EK645n_`A}hY*@bPlQhW}!pcALQfjB1bp z*Xf<0%Bl7RkMTH)$e7=*<9NFp6*zI;AL|rG@!s_xO;@b}r`+_4%t)UgBoDMQ~q(Zc9IlZ1I*KnS)Y%%=` z(iic^en{(O;lfk2i{Z89G?a0ua*|b(>Xc1xibNgC_Pd901*-M?hcl{=U7)OOC4?33 zfy33$gG-X(Whyokry(WQApUSRH>_P#wg^I_y@|av0hRak)&mo!2&H7~eZo1_K2hgg zQc5-x$K`!bly0;^4UgYT;EN)_w<2UfL!jp01MhN zkyGc>jae3mTnT42z5Mwx*{NxVJPC#^Iq9`gIiKi6+lW##J~N{aCEsVoKJLOOM*zCB z?<|<<@fBYkL#gP1?(QNwQ2!8|np{)$%~pfGZhb5s7&c^>a-gipLbOY^1dnkjSl_=z zNHnpSU^W0nRgtSME76H}zQgjPEr`_OLLre1yYin^a~*x(ArZcS+jAFlJN$NAbva3% z)tjPJw(+irseqPD^)6WnzKO)33M=5{dVzq#tsl$Oxqlzld$X$(UUj2=%HcJ*s+emZ zjV(LJIlEUl(mg6lNxRXHdR`EuA=uqG-o%NQnEdEn4_l7@S{Z>q-fffy2^#SpG2rg&6*jggGIX6qv20apBA=S!%P8wO zY??+e{u#n!WoTk`}f-bY#7``|klVp#Q< zP(r>oZ@=NQukh$Ux>V%`-zjf!{gADDf@_`_EKnNYbIIuI2OWK- zcWJ=mLw(hR7lPut;RR99&`7d|Fs_6hh&nxS`Y^OuH(Ha46fKuA-^1N{Py8zbcN0J$ z93^v$`!WBHRzI;@USCJU6UM<3QPy{d8=;{JKvWIoKmgF0=OC=E^g~!fI6yNItQ?KC z7@spRwlsHYnZ{d@$J=d4&-zMaEDp)6gjK)*$scwC?~iOUx_=T5sk>II~Kam><20orW{K5U_h<|>|bBANU~UrjPszP@fNE2cW5Km zj7Lga72y@{xpEa!tI;=0%-j0cR32Trk(YRZ0sydXTM(5yHLnQ<>PsB_(TWeK^f5v; zPLFRLO~gaxz9NAGE}bDW5obe4|p;{4zIm`J*g<=s5G`qLL`9`S3e z#IFz22>F9$44EK;sV41Z@6kti`Yu$M7wb1D$opP4#=DH!k7lyWlL4%n4WZZ7>@?$l zLn5sgc3T);%5@Fu`v50&2+xfsYpYhKw0jT`zWSrUr3EFJz6Upe4?sg-)LgtrtB*4a zx$o$u8juJ0kYRWtYxm_58g$u4l{ayLT7bv=h?jqY@pyG>7lXr#^fB9Bb-~b&iDf7L z-$BZZ^!(0$Lgj35XYYZ?7Il1AdxCC@bK=NwXrEbw_I}UYw*mJmVqS5NYmc0nG3#iq6S7OEj6#Y8;${26d zieJyG`*)G&nN@+lj^9OxlT-F8jgE)bj5p{?E<1(mcJ7{TceQm(d=;&M07{y*E5YTu z#z?6FdF07;eTQ75;Ahn}@&^P(dK9S->XNp)552KKr91eS@~VIFOwYk`Zkp!iMA+Uh zS-LVEsn3N=YqxrEK?$bsPU z{uIec5sg2q0&T=-EY1t5%9CJ20!waGsOH-4YGbg-Pb9R;cQ5Js2a?;S*Qh=XS=j)F zD^2NRuBW>h2VA&Q?LzVYwYY$R-a)TO_(Z^B~t_Dw}76FLRmCg8Fgge z&Or?|>VBr{!V317N^a)-8%xvyBaZ8U@ZtcrsSjnCy@ei7Q@GeJQ#;YGo(yW>XV*EY z{sR)b(;;oNsBg1AEX zc%qxU97keXM!j>8X1)!O5mY}GXdjW-@Xj2lh`H{ZJxf zGj4swI*IDrZFriA$a}6UIY%j}f>8M#GM&%u@;uAV-P6pl+CSWNIrfQ^imcxMc;R`Ki(3Vsrv| zrj6}+YPWqEK!Vwru$bBs*~PxR?}||o(bw)^FBqo!@Z0qeLj|RumKr>IaRvoiDJFMN z0+9&*~akql*s2GG;`A{wddGS->_}EL+qCqU@qUA6NF}9pDUh z0a#|70ef+q8lv%PefpTjN|e!+9>mxapC5tsy?9m!nS5_wS0o)-e#asa&oUuHI*!EL z%`tE+;OIP>_Q0P$gUsk?Ue7Io-_MGxsxCSfNbiq~rL)M)OzNHMU}Kj+Zu!`s;sR%B zCFL+gaysaip&J2cQnb|*05K8=oR**&Osb6+!zY{0l**fEc-f?Y6N~X*_*?(GA37Ti zA1l4mXL;AeCPqt8mrJPsW^zT<5`H#=N#0g7Xd(UxhAn8`PS6dne>F*(KAU>)NJ=oi zmd2}I3b$8ZnoxGOni?I*#}><_+}9KDh^7fDyC1i$F&QU(7BB_HaL6wqD^P4SK>Wq+r<11M6kh+q&m{4! zStwHre#2?M5;$r{CcYv&MPv-4c6_!V2f+5j()yBmY&l~iqLVP&@bK4OMTY$}=sT(T zT$e%GLvc*^wBEWFN=KQq`(Xt4#D%xD4_nZ!O#5JZ`^9KPEwc*H-GhLf|Fyj^Jijb zQcRF8wo^=E;ujT=k4(PXKd1EbhBo2YY!e|At^h2-$VeqfmWpEvlxE}|JlS12_}=XA zOu)E+LS+4tWNKhm4g~!Q51hjn;dg-EB46I58T`dzp)Ii_2>=439S*w8l3O1U7ujiDmsrondP2Dk#o*ZnQ6ap+ySDKIV3rx z5B96MddG27!U4q5>bA5S{FwiA@?jDgOF-zWTeb(u$^OBP*XXXNHR4akS#s%TOiE}z zgRAnKZklNeV_gu2l8$v$KD;~cxf#z_4kY}1w`@T|P7^6K3+OaXD;)`pXK`zHbkZJj&WoXDp&@=un!SXyUGitQC z*I(BP$y5`$WpI#Hq%A2IQF_=uY~dB?Qb@l#dykC^J9E@90!iT`a+AbW{uS!u)%MXT zz+K6s-224f~1co`_;>1dF`!2^~TRrm1n)F z`~dRQ=~JB7v55>KER2+z^AXAoh5AYa3-qCb>Qd}Fy8dT0IvvF^^1qsl?y4^Aw7gglR%9P z$2$hx-4)J^^v|+dW~csVD~Hs@l2LnHx_&^dA-b!1biAP8KIU^R=;kk9$yynSI=bo zp)LeFrF7NW4k__Y7sv6ws)u}h_Rb!kDw8q2|A4Py+JLbGwk9@uI*0f9`Q(EjZ8KdrvC@-Y1Bhy$m?` z%zE)$dmk;NH*-}V5@&qebkxqeH|f%wfMp8hRE((BpGd0&Mws~~sTXl@^rUN!JNNG1 zp#=@sSS2oKpEap!5~0c)INgB8dx0Y+rN0Qwi9ciakWu#JZ%dPCJAs|iv}(f4gGSwl zP&$q_OV#D6P+lp$wTCM59vtD}xWDObdha~@0z!s82zkv^aaqk0tbaK7v(Dqg3k1SW z1VvQ7CTr%jC*Yl-3`7x)_;Fd1Fq})Qz;|pnN^h4YP4$N^rK?~h9UJ+wSzyE|CPn1kiuDFk*R~4^H9TvvAXj}UG!!2lPi%vW`g05yPG1WmsR3bIg>zi_ba!<#IIUZ`r`6IaS}Q_OCHnew$;|**ImTM zmg9Vm2{y1wKRsrO0&|4smV!8KB2TGtk=$p)u7Gcjm&|sc$JH$r(EuMtjsYl)hQu{n z&Uv&5o{x$*)Hp<9@BLc9NT^-+2-=Jrn$BNOGZvSg%3IDs7#^Cs@LxXI@nL791y~h% zoc4M-ykUS`j;gjTQOGxKp_l9~^Ny&iExk28CQB$^{ ztSIf46^g1`5f4OO-J_H`J)jX7DWvURv0R8G?H?BbvD_g#@SmBM_Uj;sAOL~tHTpA~ z8G%~ba3&aS$68<>k^+yv)-9@2Qr-)@M`5`ASf#c?cA@R(>+x!Anv=d#K5Qq*O8OM) znA2Ea3;JtLbO>M3{pNVztk(oHveJBYQ<*RBBd2&RWon>jjNYkR>Xo{TQy!g@o&+Mx zAwRchx&+AEKvGCieR|lO%T7Rwtx(R!Y{%t%?YWZtDh+*V$ml(Az1CsC z6iC*LOp&cl9&$HFB;Odm=!i{KX$xXDY<_jn@_RjBnk#S;hb7PxMHToeJl6*tH`16I z+c>t`iIbeg$+RZ{S&}t^%TGx#4|$;{QxzJlxkNo%T_UJ@HFkNMVToK)KJf=*lMm8e zSb`;DvmtiLbER{yesXeHB6lAki_3)H5N5$wvY|jBS%S=}kjs7!j<5B6Ay-I}>h8;T zgYZz}XF*1ye4&Q+Z7Ynq3I(ZH6?f`W8;*QzrikGpkOo(;9~ zf3{=SrwUpK(l=$52v+eHihtet+?rpPKTGJgAOz4YZO?KC-F<;wpz&Ibdiw52IyPpR z*1m=WAn`KPaK_0VjR53c+iZA5TP5NQwuZ-{pn?VR<@QsOD+VJ=sqXW2 zUP!ej9?vJeQr#<$HuO>xr8M!-0l2T>54dh)E#ZNhRMElzl=u3bY^7>x<_r2p0C)j- zTWD5q$xv+sDLN`DPmmVYmdOz6DD9CYYa1a04@o|K>1y5~E(`ajDH7z7C9f)i#Ke!9 zW~h06D%Tt9i*e;yDj6G26aLUc44>xdV`NvHd{y7U@_W`q6+fMkz5C8ps-tnEm$?g% z1XL)5-<#%tp(8NVGTe&C=GGQkI~gx}M#au}_n%zVP~&A@XJ7}`_og1#&$e~3h+8A* zD{oY}Zh@);@4N__DFI{6BjxYVC)%9to72fZ$92Mt-4nP?2ibw?>fZQgB;}Z}f0WkV z@7$pDJjV#bYIk$y~o$`MWty^3c?mlG8CL~k@O+P*`Y zf=+|~M=|K@beQt+>92v+_8(5hA;IUl^K=&dYMW$oDkL8dO$W;t<)v+x8D(Z>ezilYf**n7=L$kDx_cbGYxSsn2462_a6uP4taci#T`Rj;#iI z9E_Q(69D%igLY5d*|1``V>qTzI?6fZFS`JgM;}{$ZnPPxPlRI>>qti7dgE~!b1L&B z*=Co2o!3YhM0nQG#WPC9hy&(7G9P^8j=*?`OUNC&XJ!xLXioJR-M+N9?WBlW9`7or zU8;*%dOqBf+AZ{Kav!O~JR65!XhM^5w@qmi368OYc9Jr5=@A6sO8|*rcj-PLO3tx z7P(Z&Ge7BXTx}lRLL&x|*P7NVYNV&1ZME3?n`LM_Z{i#eispAY^3*aUG$vYhlU;yz zD{cA`X;^F(@RN0002T$`?KW0pmdGqyqq( S;^;>{Fb#_W000000a;qUPCilq literal 8212 zcmV+vAnV`#H+ooF0004LBHlIv03iV!0000G&sfah!_*(xT>vQ&2UKVgRpfklJ zX`5N??lWQ-aN$^%=S23Cid~p$T44^ftNjEkQMKSP%uswRY+-2OdUyGqO0of!Jxu*pp?9<(9FN}WIaPprmSKji4~8LAie z+9-$0Dtxi?I_704y{~sq<&d`|Vd!j5>N^r;=TKciy@Z%>p&6v^fN{;pq%(i}UtIMwYe3HJkr0kv_QCy5K=eVDudiD(J-f5f4>+H+{`nC1}a7F`z7goI`oviBTmsF%6Nq8V7rT&V0{$;Ng8m~N3 z4PV>@dko=`w+7@3(cvWue;kGnZ+VxAxiyM@o?&URg&=L~P*DhkZF5Ikx={g)$Xyc) z-;=;|L>uuammOiRv)IiHy6aiMF2S*dh_I?KVSda529~IY% zPuYJT%^KIMeiRdR-AlCCLhz!>w0qUbhfR5@9k1XQ``l@q15fqA?~aOhX)#{c@)w!y zq0~ADE&P%ClNh$!OeKI*&G}@1-wRvg+zlx{nwQH-yovkvr|Ia{rf@FMfe%6HYE#E} z3{oCknq&G{DR@;3C|X~TL6e7gA(vH!{SM^$h4@m8SB8yj5~x#BZ&JCmX+kI>b*tHGGAavh{=B z=f7jPJMx2x)-(eB5#N3%k&!qq2p{C+^ku)DRO4hw;fC8MnuV_Vc+6;(Gii?MSmL-k z0iU+f4M{En2t7Fu{PKhGW7RiqicC`Js6z5`m}`tMI+Od8U)5UCLKRR}sn|4}LvbrpYwp^AJ! z(66m0R5+SSiU^j8?%}}1@@OMnPKs^2n3%i!)I};+nG%duegWAMZ@SVG2lJ{~e#(Ai z*_yPxpPDOot~rbW?F-3rDP+Y+ajoi=fi|yg=X4G$$kk)-RzYmxmDGaIX0K!G#hm{F z;2MJ@u%zkh+!8$s)}6$o_F#hv%pv1qgrJq+2bp{@Gf_%itLfENbe{+@d%O0z-KYt~ zQQygf(FpxmpvIOE8N8T2Y`Ye~lvjUqfGR(gzq5JQ-^#B`bBS5H@oHG%hv8RDWS!jSgq57I_gGzK!?f~x>4N93S#!0t$0FLZNioFu3cSzkdPAt&S7iHrM zoy73!B(idx+M_S-RE0C?V-So*R@y_1y7V7L(ed0yDBG1t>Qh0X?>PZ}PX5u71TusJ zN(>sTuLY+QV$OoZnlOs-&G}zb$!V${WKAY<&)CcRi(9Fg7gmnST96X^g!*?*pCk^) zWS?blf_qt%IGEQdHGX_{h#vtH&lABU6P1zi{m+41^`p)(^DfbW-fYkIaVyJ ze62SgiFPLUH3{`pTKWui0-g>=zo^BeJJTBIw!_da!f4vu^A}jOo3%(IFP5J+^FJPn z8J}9h?^tX{wz|$vQ;dB|*Vj*m{I|rR7q;H$yg~Dil?k6UmwMb?tW)@OR4Hs|)jDcY z8kHMpyzOgdRgd4LHh6!1ewcWiz;Mtr|7s09NuNRq5B`Ad{>?D_)ome!(GKuOS`}_% zBZNeXM>pk>hph9XF3)`aOD^`SRv0c{JVO?gn4c-LU^}EiX_? zeO<@HguctM3u!?!c;OQ5ph{?(ZXgx*~_C|AnSpNZdaMw?vX0K5+so7nQ-=*XvM|zy4SWE z;OO%_k=pJ}^8zKtF9mK5C1r-@A3@sS0LG2xY5ktK5zcurR(@jbhYfKiGa1l;CEn^y zNdT~KMO1J!aOO_d4KawwgWj|mE7>j^L4FpEa(G76$uT}Lb--O8{@QGXI6NObVj@8yzG;E>(=O z@%yxY%x=vpYw(GXs8Tf4Abg@2+X$^+9m04x2^eSY(?Bze5JP4XUXw;NF=HYscs+FJ zw+9PK6z^e*9@#yZBgFR1M}~Uh z=t%6*obUQe87$rqhNfl~#~(lkEuTh$fw-70}J@DlLMK z>vcGHgFbYiPzy=wgD0_Z0~|P5S;QdJP&^kE4INvrJ|;n9*m+dLR+PyEdTAH%qcW7YQQmp!I=Qj zIcu~RHJ`Ll7z7?r^&tlOZow&c|3DK~`hjb<){BzLKvwG9g%>?Yn)?UH6@8?6{^0E3 zHp7t_=GaifWN!q#!&ux! z>KpKEo_y8^3sNJaUI(+}r@rl8zQJ|~(AC)d#$<-s+4?O5I=GvsqW*REA^`3xSM&<3 zOjT%NQE3pV9`{`a-hJ!Ta!TyK;@P$A~@<$?>uC}{a=ak0m z4KmNB#L*SK0sdpDg0p+e!OH8?8(q>fNmin^^sb--@B_7mJGKmdz4zA_Y3!=OcRp# z;@C-FTq*8rN91lGwSV+hMj@t-Gz2d=-aX_|K1=oqXAI^^N=KO1>-Q3QqkfT$I>@Jh zYElT9snG7N<$0gM7y|vJWjvk`KxEm}Z;Tv2f`&r&vAMvCjo9RfMnaSI2_m#0!!M2W zNT(}DDHG%`SRE*&Wo6yvj?V7RWC6QR`q%7r+fG46cv1i^^&|$EGqZ9)jVv#mK7%aWD0=-xk8idi((1lVAv3)&kj%$4R0PB#=DfRSO3B|3=e%A z^_?;Rs~1O)axR~mG*4)JIvl>x?$TRIdO^i?x-LOO0+#(lSh(tTH`Vp3ofBE}ePdR7 zQ45wN!*7`AjUdp5|EHU+A&})q`Y&3Mlr+VjoDJ`s;rRJMah$m@Lg5hp<~)js9=v0= z*d{#R(th`kpeQ!1mM1n1i>;?mC1vr5c9-N+cl0p zNy$H%Xti=!7gCw>Fq{YJm2c)^^Akl|tdPr{N=A^7RQX?sv^cK5PPMR2J)R;{WT(d$ zv~>XhZnBlmSJ?g1Fi=E`xjLBiril(;oC;)&okTm(ohQh;430Tii4q;E-l5FfP35P0 z`Qh=7NysTy5qYm2`NAwH>VT)LKC!K$v(ip@>clI>5m?lJK4|tjV1^QngiWtO80Prg zY1pf%NGO<0YgcL__ME!d%B}JC1+d|DZKMm-rlJL zW%){_wIZwDIb@;Zru>iElQ(Bv-x4{MDuOxgO0tUM9mlMGCpD~Xg8yniWwxTjZaavn z<*^>YVh!=Km)Q?N%eZ7h$!kp7&aNiZZ^q{CI)RtniKdn^{J0u=N_y+XgsefutF7wS zNd)WLl4&FG83mLBsKjYK7PQUALTC=5976ortj_g0*v+Gw? zW_N_5e;s|?dJQH3?7Q*h(+-~ov~MJ(DLT&sI@VrAe^xlj6PuE1;3@J^tvc)K0A-H& z#D9(vJd~Y*1L05#Vh!~4J(tKd{V~Ym-d7q# zg2|$IW4K!(IJROxvbz0-k_qyOOZ(l?^w^ukDKEP$ufwlE#WxdFg9g@HcQZU~qec_w zj=U_2U@~}q$z`Hg{yjo6nW9XmHOBfmHv{uiR0Xq>4?i&O{+)?YpzX9DXXV@Xlh9|o zhIMm8;o3Gby8`$HW5|^M@v=Z*-+%URO*C`+J^&Uu!0c?f@4VLzUcKy8|0hG1tq0q1 zc(_h5u!qi9H87{d)hs_NCDhMK@?>2L!K_CMWolehMy&#Ad$JT7nGNqFGYCv*?umKA z@#3o)CtH!oTX&8r!6~{@AJyE)^V&dS&!iL-cAHj>6yCy~Ma#`V2N&lR`{+XPq$1RR2oWjumk{DiRlQT5}vue*-J#JVJyOpJStUx&@hoVHo;yH>d-f zYldVRQWzoGsHk?)AuUiME36M%)mzV@BDo^k9T_Sy#xc)q|2+UoZuH3G=+9oP8>kDk zUS*k;U~pD#s`!2<1N@{|b`QB%N|&Zr^$A1sYtBwviw>%quYnFYGdN7CKgaXki-o+z z0(R6&^6E7d0w^TC$qdAC~Z^WuR2=e_P>D)}8{`iNM1{pgc*GlJOR3(Z=Nhq&)tb8nFIEbZ8b z+-FBS1I1%rJsAoI({y7SSsf+wW7q+R_E(<#BJm#Z+r8b z{?9?yL5)L!wy(mPT-H+yV*&hX7nr4refG$R_YzWBJG(ln@md4BWp%NKIMXtnw|Z@C zj*~KG2=DNtrOO%`x7s8Vz;z4_a}hyx>3?FO{y-fID9$} zSMCmG9*$t4V#?vQl^Kdh$Za>9Y^~T}6#fnwbKna&SpiFtst1!`L(YAK58%3i9Ze>Z z0^&-|g0Y&S#CF5vrzl0<5>EWeBy>7CttCL=p)dJx*6kqjtQ#ahSC!RTTYk)_9guK( zx{@qkZe?*)|3^Q$T+v`oU*ULH*c6YiwV&FJRQMuRWx%A@s=JyV zmGFfRZd(K!iv0B8)iMV-ZS-E}?rW=q*&P+Mgqd(w30$TZ%byUZso;G-MBW6uk}zvS zNq8_))QfpIT{ELppU~i`3D>j(J#8_ww1zaR!FRpUG($n5>6vDUBFvf~Pqq7aT+*Cs zueTY|Nx>GPxD1$;s8E_5&v+>FI(B#ipbYq97t`#~_!)UWGtBAo!Z|JBVP>s{8$oLW zUQo#rTjVHRgumFmhE4yNNV(>NQ>m={)um4r;n%2gyqplL;^vYCi+1K>RO;$KeC0XJ zyj=Qy>a7zXuDGv=uhIJ^+>G9RmuCBB zZ-*BI2P8HOrEN`IX#o)AG^iT#*r&}Ktk}sm(xWnVG^N^EWD?Y&2oE`4o_zruAf}L- z^vLG|Wj2OCRc0`+Xjik;rW-x1enR&@-->tVO{lQegA~yb9-ivHGyG#h&hRMCC!4yJ z7?fTN$a!kiQ0!}5BHJpMO>FbYQi%FsvdCTQg{- zjCyVlUsl*4{O$)AFgIxJs5#C6oEEYpa>71w<;5&T?U!Y0L)~2q2BhH?hM34pz*`ge zb^JcxyPm;%yS?gpXxT)yvfn4FsNw0_O?WZZ1K(WW)y3xy?0T4l1vZD}Bj6Pdi;!zAfuT@co zl>cA=CoLpK5I)X}RUQvRXevtj!mrXI-ZUP>sz2`2zIG&>r%acg3HBS``#>l4gP26C zg^OP)u>=UaHb%g4K4^w+zXzf+fq>xu zN+EXpqbU!@23mGn?p$7~)KDOCCJKL*4UG1nqAAiFVOJs)B2e!~3BZW~LzQRiV9nHtme0DV6a zWV+O-=>7MfOUu<}jQdwPDdC)y8v9UYcN7&0bX_>pDl<~e$VE-NCAOY|@qv`8tO;U{ zBO~$m&{^Y6>^b@v6bT{NReI~t4bb0yu1N@;bBSzxZ-3|K)xR~Ee9X%Ewoc6P>t?%3 zENGhWl5-*P%?wtBJ0ZXboFJ=q>R0nQ$3^)=R7L*UsH?u7=s&)vL|&AmXPQY^q~lV9 zAy>_#I6;TvTPNEuWiR^sHbmecGR<{%o;f)D4dP7w7j2pAL;W*qoOjD10{tr})+g5}L8KhoQDnOtP&-aUET;#2ZgCN zpjl_5LN1tJ`*HL{!;F3>M>}qyV7Jus*qxlaF}$l{eT0}@eMox-ms4pL*cHe-#Q%k% zZGzH>_McmWR*DmY8aE6#Wtl!mXfp`d~)|o&-rkMPMu>&PIBjpOWGc8 zIfttwq&TP`gmVOvPFXwh!dqjZU*~FNCeX36t|CPJ7_Fee5olVwsEhm=`9F}}x2X?* z;mr)C{wvKyaY|`h0mVFyZL_7A^(OptJK`*jOTPyuQqe~I0X8I*Ry-WRr;_3!jFM5+ zA|J>bNN0SK6CN*%9`JI2SH=>_t9Vt!cUDB(ZtnKZ7v@C)R+^ZO_3ompu`#?o^ z!ODszSGLDW5z_7;UH{M4ZfE)uptmMoL-G6pj~z(+ayrpb%8Bx73H%hc4tuH!vg~zu z#LY!xUr^0vVj9ysrrK^Xo~dCHe1>lQ=Uwt6N;_5)9UHQOXT5Cx+WZ5;Jxl%@ru|V* zN=x}<^I2vww{{>~XHyK2Qh4#SW0LQ}xp{ZAigy54V%rN22%Ti-PAPa0m|bf+%GA*F zdu71aB=05Jm#Btj80hx#8IItNjS1p_?biKtoDICM1m|1!P(a4`;GveA5&XEyxd72D zLMtthijHu%Z|9~rGh2Keax7kpqbI7@VC3S&3&}#I*Lr*X(@D>D*e4mQBTh_l zKeZpbSH){L4Qk_p#h>w4>m1xcs+-XI`c!Ge+y~{XRdEjw1QLi>b+kceAqGpwkK)#f zoKAIq?Ovjyfz!vb_Yz{$s+XaBq>iYg;5;P+0QGg6pXKgKS7`J8Wf}!@8s~S62BX!C zv>>>tsPnCgKnSe-=r!#|FN8Y@fqh`uoO2`z3oF*`EEBR~L4dmHZn0q{T7hXVlq^=m6WFb#_W00000 G0a;p@jQ~FY diff --git a/man/import_one_sce.Rd b/man/import_one_sce.Rd index 09b2737..fa56544 100644 --- a/man/import_one_sce.Rd +++ b/man/import_one_sce.Rd @@ -7,7 +7,11 @@ \href{https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3}{Mangiola et al.,2023} } \usage{ -import_one_sce(sce_obj, cache_dir = get_default_cache_dir()) +import_one_sce( + sce_obj, + cache_dir = get_default_cache_dir(), + pseudobulk = FALSE +) } \arguments{ \item{sce_obj}{A SingleCellExperiment object from RDS, the metadata slot of which @@ -16,6 +20,9 @@ must contain \code{cell_} and \code{dataset_id}} \item{cache_dir}{Optional character vector of length 1. A file path on your local system to a directory (not a file) that will be used to store \code{metadata.parquet}} + +\item{pseudobulk}{Optional character. Set to TRUE for generating and importing pseudobulk, +the metadata slot of which must contain \code{file_id}, \code{cell_type_harmonised} and \code{sample_}} } \value{ A metadata.parquet strip from the SingleCellExperiment object. diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index 4a2cb1b..295cd19 100755 --- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -203,17 +203,27 @@ test_that("get_metadata() expect a unique cell_type `b` is present, which comes expect_true(n_cell > 0) }) -test_that("import_one_sce() loads metadata from a SingleCellExperiment object into a parquet file", { +test_that("import_one_sce() loads metadata from a SingleCellExperiment object into a parquet file and generates pseudobulk", { + # Test both functionalities together because if import them independently, + # the sample data will be loaded into the cache, which causes the second import to fail the unique file check data(sample_sce_obj) temp <- tempfile() dataset_id <- "GSE122999" import_one_sce(sce_obj = sample_sce_obj, - cache_dir = temp) + cache_dir = temp, + pseudobulk = TRUE) dataset_id %in% (get_metadata(cache_directory = temp) |> dplyr::distinct(dataset_id) |> dplyr::pull()) |> expect(failure_message = "The correct metadata was not created") + + sme <- get_metadata(cache_directory = temp) |> filter(file_id == "id1") |> + get_pseudobulk(cache_directory = file.path(temp, "pseudobulk")) + sme |> + row.names() |> + length() |> + expect_gt(1) }) test_that("get_pseudobulk() syncs appropriate files", { @@ -253,4 +263,3 @@ test_that("get_pseudobulk() syncs appropriate fixed file", { expect_gt(1) }) - From 923de59b479480366da2b0f08fe997cf9f0156ce Mon Sep 17 00:00:00 2001 From: myushen Date: Wed, 24 Jul 2024 13:22:31 +1000 Subject: [PATCH 3/5] quantile normalised pseudobulk counts --- DESCRIPTION | 3 ++- NAMESPACE | 1 + R/import_metadata_and_counts.R | 33 +++++++++++++++++++++++++++------ data/sample_sce_obj.rda | Bin 8328 -> 8332 bytes 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 022d2fd..c07dfe4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -96,7 +96,8 @@ Imports: stringr, checkmate, stringfish, - tidySingleCellExperiment + tidySingleCellExperiment, + tidybulk Suggests: zellkonverter, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index b0009d4..1179d95 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -84,6 +84,7 @@ importFrom(stringr,str_remove_all) importFrom(stringr,str_replace_all) importFrom(tibble,column_to_rownames) importFrom(tidySingleCellExperiment,aggregate_cells) +importFrom(tidybulk,quantile_normalise_abundance) importFrom(tools,R_user_dir) importFrom(utils,head) importFrom(utils,packageName) diff --git a/R/import_metadata_and_counts.R b/R/import_metadata_and_counts.R index e85d846..079e533 100644 --- a/R/import_metadata_and_counts.R +++ b/R/import_metadata_and_counts.R @@ -20,6 +20,7 @@ #' @importFrom SummarizedExperiment assay #' @importFrom stringr str_detect #' @importFrom tidySingleCellExperiment aggregate_cells +#' @importFrom tidybulk quantile_normalise_abundance #' @examples #' data(sample_sce_obj) #' import_one_sce(sample_sce_obj, @@ -138,20 +139,40 @@ import_one_sce <- function( # convert metadata_tbl to parquet if above checkpoints pass arrow::write_parquet(metadata_tbl, file.path(cache_dir, "metadata.parquet")) - # generate pseudobulk + # generate pseudobulk counts and quantile_normalised counts if (isTRUE(pseudobulk)) { file_id <- metadata_tbl$file_id |> unique() |> as.character() - cli_alert_info("Generating pseudobulk from {file_id}. ") - create_pseudobulk <- sce_obj |> aggregate_cells(c(sample_, cell_type_harmonised)) + cli_alert_info("Generating pseudobulk counts from {file_id}. ") + pseudobulk_counts <- sce_obj |> aggregate_cells(c(sample_, cell_type_harmonised)) + + normalised_counts_best_distribution <- assay(pseudobulk_counts, "counts") |> as.matrix() |> + preprocessCore::normalize.quantiles.determine.target() + + normalised_counts <- pseudobulk_counts |> quantile_normalise_abundance( + method="preprocesscore_normalize_quantiles_use_target", + target_distribution = normalised_counts_best_distribution + ) + + assay(normalised_counts, "counts") <- NULL + names(assays(normalised_counts)) <- "quantile_normalised" if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) { cache_dir |> file.path("pseudobulk/original") |> dir.create(recursive = TRUE) } - pseudobulk_path <- file.path(cache_dir, "pseudobulk/original", basename(file_id)) - saveHDF5SummarizedExperiment(create_pseudobulk, pseudobulk_path ) - cli_alert_info("pseudobulk are generated in {.path {pseudobulk_path}}. ") + if (!dir.exists(file.path(cache_dir, "pseudobulk/quantile_normalised"))) { + cache_dir |> file.path("pseudobulk/quantile_normalised") |> dir.create(recursive = TRUE) + } + + path <- file.path(cache_dir, "pseudobulk") + pseudobulk_counts_path <- file.path(path, "original", basename(file_id)) + pseudobulk_qnorm_path <- file.path(path, "quantile_normalised", basename(file_id)) + + saveHDF5SummarizedExperiment(pseudobulk_counts, pseudobulk_counts_path ) + saveHDF5SummarizedExperiment(normalised_counts, pseudobulk_qnorm_path ) + + cli_alert_info("pseudobulk are generated in {.path {path}}. ") } } diff --git a/data/sample_sce_obj.rda b/data/sample_sce_obj.rda index bcd5744eda0bab0f3297a503b10e9c9a0155639c..e1b6198790a2a1e62615238e69d189b5ccd24ed2 100644 GIT binary patch delta 722 zcmV;@0xkWBL5x9=8Ut=1P_Z4c9e<)(rz7dd!PLX#IoFZf>@;D)-!lQ0@?9U!F+Eqm zmRRUMOm0a~FUGqemGF^aE~HIlc}I;-#OFwm;x^^v8|ul?UZ?C=a&e&ivYKR}K*N!@ z{pH1Qy*%0J`?LJI>*1h_MJJmCc(rB;zdY|d&F#3pYdk)ezAflew+*rmo`0{Xt=R{& z3LJ5jweg?#DAqm|xF&^ey*A}Bmo(W_Zn6RiX7Lw_MRAD$CJ%+G*+QP^$o%6|1f3wX zwl&&mrj3k-d?$Q?I)#w+i7_rTK4lPce7ce*MTo7IwwtQRBb7!duPSUrTQn5k#L&y} zZ4_&rtZl1hyk%d#c9>puD1Xm*0$yRvXOx52fn)Ae|0Dnv3IfrrXJEx?Ev8rYHEiax zBQt%*mwK}3&No9yfkL^^K$01-Ob<{X>WoWbyS7y(3!42F1?KZ%vIpX9C^ZkxE*FLW zz&SzQH#fF!E4f59$8Ew46DtKJAuI?kF*PTi-$M3Axz&Vq4JUhBzJJhlLTQ|^E0rg# zB}SF%b-tZj`<+8p^md6*$_=pXg=H&)Bz0UFQ>dJoM*x zeds~0Gxb#bRk={cb$^%)5`2R`-fMA!ms?RNX_)h)(o5*bM~Wf>7TO0?lJ5Ir4vRfe2wt9!_fd6$bJXb)6@*q0eICL0x91QA(8zCh zOX`fF!=z0c4K8Bs!bJ6f`9#~-&v`9oMfwLfL4|7|P_iEeFGH#XUBnUHndNbDnkfNe%_HUS&?nlQ z?VHodKgV^#jNKEsO$XV5>FVD2XC&pAuz!@+-tXL?^gPE1!{m)jx`D8c$$$N(A3gVX zg&qFi*;$c(On}M}P7S?^XGfP46!%1LG%(t}Lz;q4ga1b{=<9Tt^6=@efz|dOPR1d@ z=ehHA7X50QWO6Db9}i6j%PE!8O3J!HW)1$PQL4Zex{Ni(W(chDCb3b$AJ<`Qka*&o zhq$5Dmt%bNE3Z4ThrkX5w|^V~X8bV53(T%qY{@3F+Y40dW@psf_77oisc`p`f0Rg= zzb+4tpha7AxbLc|&tiWGAx^(d^o>@FICtNUtp<7=jG3zw0QVt-c2C{euwuAlIHpiK z$~ojOy8x6&A6tHIv>B;Sgku!zNJim$<8c{tD)S`SW|x1R*GL#dcz@Q>#WPC9hy&(7 zG9P^8j=*?`OUNC&XJ!xLXioJR-M+N9?WBlW9`7orU8;*%dOqBf+AZ{Kav!O~JR65! zXhM^5w@qmi368OYc9Jr5=@AtpKZ0+`kQ5FJ8$9~4~ph@Ir7vpBs3;kc9UIzb}McA662TzDbyFY3_849t19D; zXQpNF9eCd(Tn-_1)TdQ$8WTW97dPxL5o=Yb&%B={qfs)U;Me_| z62jLBAdjh=X|Z+TxRWs`B$)sJ0LRJ~J^%sZK Date: Wed, 24 Jul 2024 13:31:32 +1000 Subject: [PATCH 4/5] assay name --- R/import_metadata_and_counts.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/import_metadata_and_counts.R b/R/import_metadata_and_counts.R index 079e533..a426791 100644 --- a/R/import_metadata_and_counts.R +++ b/R/import_metadata_and_counts.R @@ -146,7 +146,8 @@ import_one_sce <- function( cli_alert_info("Generating pseudobulk counts from {file_id}. ") pseudobulk_counts <- sce_obj |> aggregate_cells(c(sample_, cell_type_harmonised)) - normalised_counts_best_distribution <- assay(pseudobulk_counts, "counts") |> as.matrix() |> + assay_name <- pseudobulk_counts |> assays() |> names() + normalised_counts_best_distribution <- assay(pseudobulk_counts, assay_name) |> as.matrix() |> preprocessCore::normalize.quantiles.determine.target() normalised_counts <- pseudobulk_counts |> quantile_normalise_abundance( @@ -154,7 +155,7 @@ import_one_sce <- function( target_distribution = normalised_counts_best_distribution ) - assay(normalised_counts, "counts") <- NULL + assay(normalised_counts, assay_name) <- NULL names(assays(normalised_counts)) <- "quantile_normalised" if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) { From 156d9d16fb0d378584f0f98701fc09c0c7827157 Mon Sep 17 00:00:00 2001 From: myushen Date: Thu, 24 Oct 2024 11:10:41 +1100 Subject: [PATCH 5/5] update cli message --- DESCRIPTION | 2 +- NAMESPACE | 2 + R/counts_per_million.R | 4 +- R/import_metadata_and_counts.R | 123 ++++++++++++++++++++------------- 4 files changed, 80 insertions(+), 51 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c07dfe4..1261200 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -127,7 +127,7 @@ biocViews: Transcription, Transcriptomics Encoding: UTF-8 -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 LazyDataCompression: xz URL: https://github.com/stemangiola/CuratedAtlasQueryR BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues diff --git a/NAMESPACE b/NAMESPACE index 1179d95..5f2402f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ S3method(as.sparse,DelayedMatrix) export(SAMPLE_DATABASE_URL) +export(calculate_pseudobulk) export(get_SingleCellExperiment) export(get_database_url) export(get_default_cache_dir) @@ -26,6 +27,7 @@ importFrom(SingleCellExperiment,"reducedDims<-") importFrom(SingleCellExperiment,SingleCellExperiment) importFrom(SingleCellExperiment,reducedDims) importFrom(SingleCellExperiment,rowData) +importFrom(SummarizedExperiment,"assay<-") importFrom(SummarizedExperiment,"assayNames<-") importFrom(SummarizedExperiment,"assays<-") importFrom(SummarizedExperiment,"colData<-") diff --git a/R/counts_per_million.R b/R/counts_per_million.R index 6a88217..a189a81 100644 --- a/R/counts_per_million.R +++ b/R/counts_per_million.R @@ -12,7 +12,7 @@ get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) { # Save SCE to the cache directory counts folder - input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir) + input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir, replace=TRUE) data <- input_sce_obj @@ -39,6 +39,6 @@ get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) { # Check if there is a memory issue assays(sce) <- assays(sce) |> map(DelayedArray::realize) - sce |> saveHDF5SummarizedExperiment(output_dir) + sce |> saveHDF5SummarizedExperiment(output_dir, replace = TRUE) } diff --git a/R/import_metadata_and_counts.R b/R/import_metadata_and_counts.R index a426791..b4eec8d 100644 --- a/R/import_metadata_and_counts.R +++ b/R/import_metadata_and_counts.R @@ -13,14 +13,13 @@ #' Directories store counts and counts per million in the provided cache directory. #' @importFrom checkmate check_true check_character check_subset assert #' @importFrom dplyr select distinct pull -#' @importFrom cli cli_alert_info +#' @importFrom cli cli_alert_info cli_alert_warning #' @importFrom rlang .data #' @importFrom SingleCellExperiment reducedDims rowData reducedDims<- #' @importFrom S4Vectors metadata metadata<- -#' @importFrom SummarizedExperiment assay +#' @importFrom SummarizedExperiment assay assay<- #' @importFrom stringr str_detect -#' @importFrom tidySingleCellExperiment aggregate_cells -#' @importFrom tidybulk quantile_normalise_abundance +#' @importFrom HDF5Array saveHDF5SummarizedExperiment #' @examples #' data(sample_sce_obj) #' import_one_sce(sample_sce_obj, @@ -74,8 +73,7 @@ import_one_sce <- function( if (isTRUE(pseudobulk)) { assert( all(pseudobulk_sample %in% (colData(sce_obj) |> colnames()) ), - "Character in pseudobulk_sample must contain sample_ and cell_type_harmonised columns - in the SingleCellExperiment column metadata") + "Sample_ and cell_type_harmonised columns must be in the SingleCellExperiment colData") assert(c(pseudobulk_sample, "file_id") %in% (names(metadata_tbl)) |> all() , "SingleCellExperiment metadata must at least contain sample_, cell_type_harmonised, @@ -92,10 +90,14 @@ import_one_sce <- function( } # Check whether count H5 directory has been generated - all(!file_id_db %in% dir(original_dir)) |> - check_true() |> - assert("The filename for count assay (file_id_db) already exists in the cache directory.") - + if (any(file_id_db %in% dir(original_dir))) { + cli_alert_warning( + single_line_str( + "Import API says: The filename for count assay (file_id_db) already exists in the cache directory. " + ) + ) + } + # Check the metadata contains cell_, file_id_db, sample_ with correct types check_true("cell_" %in% colnames(metadata_tbl)) check_true("file_id_db" %in% names(metadata_tbl)) @@ -107,9 +109,14 @@ import_one_sce <- function( # Check cell_ values are not duplicated when join with parquet cells <- select(get_metadata(cache_directory = cache_dir), .data$cell_) |> as_tibble() - (!any(metadata_tbl$cell_ %in% cells$cell_)) |> - assert("Cell names (cell_) should not clash with cells that already exist in the atlas.") - + if ((any(metadata_tbl$cell_ %in% cells$cell_))) + cli_alert_warning( + single_line_str( + "Import API says: + Cells in your SingleCellExperiment already exists in the atlas." + ) + ) + # Check age_days is either -99 or greater than 365 if (any(colnames(metadata_tbl) == "age_days")) { assert(all(metadata_tbl$age_days==-99 | metadata_tbl$age_days> 365), @@ -139,43 +146,63 @@ import_one_sce <- function( # convert metadata_tbl to parquet if above checkpoints pass arrow::write_parquet(metadata_tbl, file.path(cache_dir, "metadata.parquet")) - # generate pseudobulk counts and quantile_normalised counts - if (isTRUE(pseudobulk)) { - file_id <- metadata_tbl$file_id |> unique() |> as.character() - - cli_alert_info("Generating pseudobulk counts from {file_id}. ") - pseudobulk_counts <- sce_obj |> aggregate_cells(c(sample_, cell_type_harmonised)) - - assay_name <- pseudobulk_counts |> assays() |> names() - normalised_counts_best_distribution <- assay(pseudobulk_counts, assay_name) |> as.matrix() |> - preprocessCore::normalize.quantiles.determine.target() - - normalised_counts <- pseudobulk_counts |> quantile_normalise_abundance( - method="preprocesscore_normalize_quantiles_use_target", - target_distribution = normalised_counts_best_distribution - ) - - assay(normalised_counts, assay_name) <- NULL - names(assays(normalised_counts)) <- "quantile_normalised" - - if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) { - cache_dir |> file.path("pseudobulk/original") |> dir.create(recursive = TRUE) - } - - if (!dir.exists(file.path(cache_dir, "pseudobulk/quantile_normalised"))) { - cache_dir |> file.path("pseudobulk/quantile_normalised") |> dir.create(recursive = TRUE) - } - - path <- file.path(cache_dir, "pseudobulk") - pseudobulk_counts_path <- file.path(path, "original", basename(file_id)) - pseudobulk_qnorm_path <- file.path(path, "quantile_normalised", basename(file_id)) - - saveHDF5SummarizedExperiment(pseudobulk_counts, pseudobulk_counts_path ) - saveHDF5SummarizedExperiment(normalised_counts, pseudobulk_qnorm_path ) - - cli_alert_info("pseudobulk are generated in {.path {path}}. ") + # Generate pseudobulk + if (isTRUE(pseudobulk)) sce_obj |> calculate_pseudobulk(cache_dir = cache_dir) +} + + +#' Generate pseudobulk counts and quantile_normalised counts +#' @param sce_data A SingleCellExperiment object from RDS, the metadata slot of which +#' must contain `cell_` and `dataset_id` +#' @param cache_dir Optional character vector of length 1. A file path on +#' your local system to a directory (not a file) that will be used to store pseudobulk counts +#' @return Pseudobulk counts in `HDF5` format stored in the cache directory +#' @export +#' @importFrom S4Vectors metadata +#' @importFrom dplyr select distinct pull +#' @importFrom cli cli_alert_info cli_alert_warning +#' @importFrom S4Vectors metadata +#' @importFrom SummarizedExperiment assay assay<- assays +#' @importFrom tidySingleCellExperiment aggregate_cells +#' @importFrom tidybulk quantile_normalise_abundance +#' @importFrom HDF5Array saveHDF5SummarizedExperiment + +calculate_pseudobulk <- function(sce_data, + cache_dir = get_default_cache_dir()) { + metadata_tbl <- metadata(sce_data)$data + file_id <- metadata_tbl$file_id |> unique() |> as.character() + + if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) { + cache_dir |> file.path("pseudobulk/original") |> dir.create(recursive = TRUE) + } + + if (!dir.exists(file.path(cache_dir, "pseudobulk/quantile_normalised"))) { + cache_dir |> file.path("pseudobulk/quantile_normalised") |> dir.create(recursive = TRUE) } + cli_alert_info("Generating pseudobulk counts from {file_id}. ") + pseudobulk_counts <- sce_data |> aggregate_cells(c(sample_, cell_type_harmonised)) + + assay_name <- pseudobulk_counts |> assays() |> names() + normalised_counts_best_distribution <- assay(pseudobulk_counts, assay_name) |> as.matrix() |> + preprocessCore::normalize.quantiles.determine.target() + + normalised_counts <- pseudobulk_counts |> quantile_normalise_abundance( + method="preprocesscore_normalize_quantiles_use_target", + target_distribution = normalised_counts_best_distribution + ) + + assay(normalised_counts, assay_name) <- NULL + names(assays(normalised_counts)) <- "quantile_normalised" + + path <- file.path(cache_dir, "pseudobulk") + pseudobulk_counts_path <- file.path(path, "original", basename(file_id)) + pseudobulk_qnorm_path <- file.path(path, "quantile_normalised", basename(file_id)) + + saveHDF5SummarizedExperiment(pseudobulk_counts, pseudobulk_counts_path, replace = TRUE ) + saveHDF5SummarizedExperiment(normalised_counts, pseudobulk_qnorm_path , replace = TRUE) + + cli_alert_info("pseudobulk are generated in {.path {path}}. ") }