From e4999c4deedef2bb68de697ca1d14dd24fdce89b Mon Sep 17 00:00:00 2001 From: Eyad Date: Mon, 19 Sep 2022 23:58:20 -0400 Subject: [PATCH] submit --- README.md | 23 +++-- img/blocksize.png | Bin 0 -> 30456 bytes src/main.cpp | 14 +-- stream_compaction/common.cu | 20 +++++ stream_compaction/common.h | 2 + stream_compaction/cpu.cu | 46 +++++++++- stream_compaction/efficient.cu | 153 ++++++++++++++++++++++++++++++++- stream_compaction/naive.cu | 57 +++++++++++- stream_compaction/thrust.cu | 26 ++++++ 9 files changed, 321 insertions(+), 20 deletions(-) create mode 100644 img/blocksize.png diff --git a/README.md b/README.md index 0e38ddb..23de218 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,23 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) -### (TODO: Your README) +* Eyad Almoamen + * [LinkedIn](https://www.linkedin.com/in/eyadalmoamen/), [personal website](https://eyadnabeel.com) +* Tested on: Windows 11, i7-10750H CPU @ 2.60GHz 2.59 GHz 16GB, RTX 2070 Super Max-Q Design 8GB (Personal Computer) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Introduction +====================== +I implemented exclusive scan on the CPU and on the GPU using both the naive and work-efficient methods. I've also implemented stream compaction + +Analysis +====================== +**Effect of Block Size on performance** +I ran the algorithms with variation in block size on arrays of size n = 2^14 elements, and the following graph shows the results: + +![](img/blocksize.png) + +There doesn't seem to be any sort of conclusive relation between blocksize and performance. +**Effect of number of elements on performance** +(I ran into a bug which rendered the algorithm incapable of running on arrays larger than 2^14, and therefore was not able to produce any meaningful results especially in comparison with the cpu) \ No newline at end of file diff --git a/img/blocksize.png b/img/blocksize.png new file mode 100644 index 0000000000000000000000000000000000000000..f7d316722c373ffa72275e4d5b07b51b0dbbd49e GIT binary patch literal 30456 zcmdqJcT`e;96ziX4a{tamgQ(zim7FqrMb7}21yLH%v4;J;Ko%}W{z@HntOr@4RN4C zwk$0N4qT`W2O@4oMfACTf8&qm&*z-yoZtC*j^~EK<&O8~JznF++gh87fF;2K0s29F@Ge<+<0c4rz4xw} zQ=ou=xZL)ypt=3&IRSy7L~|p3ScnUUxwirSce8nZmsFmOK>zD1xd<-f7t|80BY~1L{8K^3J%0jI8^sfh+yeO&-`-gu``<3w75A8Y= z6^o%Xm+}>-a84s*u44HrZ6dVy!~2tU`^@N{j1z{oc$|6?PsZ#lUE=8>mauXBQe1)2 zy1~r@tAfCR>V5avy(I+vL%`taN74VbN&lB@l@(tohM@jA5^s1XUtM^@Ul|N&db8T{ zQYD~%hEgsS)OWMD!s#McGe;(JwJ9fJ!F=%T%?H@2?jQSgH|9pO|7hI$vWK^^*zQ-u z4En$()+@LB$yX>jSNByoeenJE?C1)gJ2E&x_F-;J;`*w*9R_h+CUYZX7{^5&)9`=8 zdEctsTWl#&?oetyxBux2n^W=M+uEw3a6+wnAl>|I|0ZuXz{$y}ss6V^X4k)$u94oK zAB&1^TKsx$D(x+$74)+YTU2Blq!zgm61>rVb@7_k{FlSbjTaG>Dk25k5%1FMJHXNG z8&*JmU9K%WvsN`c=~s7+OI5)!G*C_&Sz2Gs!cv`uvsQ* z8oaYFXGbG%mNo)Wg>01o%o1!nNN$S^JJk z!(6QJdd&Gwbt^_i9s0~e>tyQ`#bfTq3D_qpc|{*E)brZaW8OBH+@R-3I6 zki)5;rTRyzk!2&-S*6O~JGtEixME1@D!zAXA~z~XNxWGq_zh1t8>SaFg)hBvUbd$k zQKP%H)=tFQ9BmfdecUbUlHdJVMMLM>yZM8zkB{Gf;-mfaQq*Py4+sL3;_psV8L4&0 zxECii6U!ilR#}&b85+Ydl8bim@s~=@<4}EZ6_@Fx!lb|>J`XJeGlxPavYE@!6-tTs zY$G?QQdTnqI>eGZ5>U6k%&vE&7EqJYPO}-L8oNT1I_VRd5(UnomJ4L6+LPg^-BOqo zsgTi_CN1Og+DMIzwiC;j*4No2j!o2h*M63%MjiLnP-`o=3SQ!em(c&+4`O)|*f#mwmRB&D5F^y_Tb#(2(6%WaB1Sv?+ zs_y<+Y=B~YOfFj4G+COX3i3EK@g^_{74kbSXfppXCAN4aUzKE%C|SoZKpH{_RSn=& z-t-5XIV)#{RG%I~diMPmg`K#g|GJ(e<+kJ5~~6gpRU|0=z>h<=Xz@ z)VBCXBF88UC`<`i^}TVI#RrI@V=d8-lEk7?vx5c{LUzlC!&v!Go%tm2LHL58o@Uyl zop$;ff_S69(eb4X7WJjq;DzXLcX34?F%k`5R(}M}*_&sn0@;cAsSVz_M~EEBroifh zIDY9kJ*N`?4KyDC%0(&)%1S-;gV}h@Aa2|XMd@z0h@ zlIy5^&ZQSEsHfy5~L~pbfQjml@NajrUpWY(#RH^F1 zyv|iMv&S=@ofMzfmMV>y`W?7POzx+*H$M+;cJf^%`9xU?9>vo;WtbDSxi%wxQiXI- zjK5*@HF9{77qIZ}$IS4TNq+Fgd~>~c>usOa9uX7n?F#3bHs)YozTxAmLT*ZnUk=;! zCF^c_?Uspni;7$u8;rDJCgUV7Laz2e%$W1tzpg*9{GwIO`qO3R!c$V2da*QBPNFd0 zof=zn&p=QX7c6r(wXyN^?=LJC0~+QZ0SP5|rq6q;%yRz6+1zPL=C|rgrnwWED(dem zC>l_{f-O9(?E1R`b+3+($SSXcHG?iL-uB@7$jEd4G0QJW}7h{Nssn>XtqoJ9|foXDdt7AEW5 zXWWuPpu#cxdX%i0yWx6P=_~bZHe{CxthOS-r?D+uB&`UX-j=Nu;A53sf>RHfa|@w!m(!E$XcmFFfInM2##DbYLw8$C6Rg>tCJ@ zN6nisU|**sTqAz>+D$y9nXol5HaAYL-M;#cR=(V`B5%cB5y=i{>B0{6R9>M4!g)VM zHw%?hI-3y_bbM(JgI~cZ!cdSIt?H4F2F{)X`yHdgIf&mrO#*6%CNTED=!yK((VynE-(I?1XhmNYl znm*FD_yI4k$RD6aIJ-_W;KbnNl~1Z7ZT@_XjtELou#!G1|T(&h(BNVhot#KjYK8YzGv%h z0%78nJNZgnjZ)Tj1=5fcK653w0XxKx0G(L(=jRe&44a7r9j&{gdR*S-z^HDy-oMJ4 z;em`Z7OKVk$1;(9kOL66C)GA__MBjj5Ik~&OWR`pzU&nW`URAQsA|pwuXSmZ3|4wF z49_m1^?HTE-MqKTPoAVYE{OdKSePeIA#zQECJVVj>T8MQ>IY~(}O3s;P zh2@Xf#ptfDXA=3Pn6eX_kXH)XJPth!BDMrEV(0(Wz5mZFSSpEX!j@!lQ8(Kh=2L2zq zO}PQBwDh_T4Qbb8pVI;oI_H&p(!Ijl_#QI}m!1{2#x0)}m~h!XyVs4Xn4W*$vXI2f zrL(pS2anZtCCIm)_rHCPL!Pm;`BK`o3ChDr704I+z3WEmUt5gQZ=wg`4(0qjeo3_L zY;*a&pO!ACM4Z8Uq1FXzm5cyxgWsM@^yFV*mhp<8mz9=+TY2u;TbfWlUYcv z(WJ)GAle*8q3d+7f4g||)6SYCf|B;%XBUUw^-}F`8uDz9mZP29wYfJKJ0Okz>;B}E zWnc9)`b$%7PQBx4>q1YLmt1c~wU$4+5_VT2>YRHQs-l5jy2`~wvTD8Rvor!`+VrIY zrA}pQA$MmVLpwbk=5|ih(d(&->lY>Hv3e)THy*BClp%_-M zbB#YohqP24y8fn@DPct3L_e3WnDlWxIr5ruZ(*d6{nA9cH;mw6GMJYEf$#C6P2o_A z7;X4Kr}}iPuyWs}w;9l0OwJUtl>L0%*rD~=t{3$6sjGHAy~Yx#e@F+i4MdE!(X&<@ zsG4qf2XcLKaUCb3cSz-zcn{fyWf>RhN9%Q-2=&b=())_I#^UzaSK6f&dNVP{)#M)& zPedLA*Kciv=KU>SG#1K5-Txs+1X(jKa)g)v$eS%~I!Ml;wy+{n7w`4gV1NjW zQq1;JyMZDVGVB;qQbHUZ&W}O~CO}HHOXRg89{FgLZOYNyXJxO%+>K*?)iJxC$Jo>_cwAUQvM# zoRi_~-+z-H0cn9ODWukpXZi%mr-&KFxy#&@U!<{0vTl@=?9)F=q#S02FV}!`r)G+u zDP&RtCwEi|S1QjSTtbT#^9TnBvvU%-M$(vA^qA?<$LM0=!%OjzHiFN3zjjgW6KOeY zZhO>b*XA-Qs!m)4vJOkJiUX?*^fsJW3yD^)pH@X2 z=n5mq=D1b{pYy05NFLDfNSfCvzt&5701n^OUx8lZ_1=aYN|&y(j$~qd;n9bsoQoNM z3YPE&&yjiE%eI2)^k}fWasQvvk}6{8>)RkM&c|FATTJ69y(6?KPv{!t96oOyxRYjn%>4O>rrt2b7 z41Xr2E)bcEC{F-a|LCCb*hKn!TB+92xVm1pJTzaK*TG-h*<9q6VSVvHc8AWFvvvzN&9`qA`X|UuW%RkO-3pugfhAQ(du<3 z$u6yL);7Tw!^xfugu1c9#ca!}JA(O7ia$M=VJP@^Oeh$31Xl%&L_S6uK-8APr;-XD z*Kk=~&Ezn-52C^avW`b9A}!-QD@`#z**3)XXOllt^|3{iNKF#FGY@NT z>Dr+D59&7cMnyzysH}s?KWH^L`rttEd`NTC?)m~B2OuJ1ob~Idh&UgJqdPvO0#`PR zh{iva1zm`zzngE$=!JW09c9(_@@6n#@rYTFO;Z;A>B@6b1We9&=@Oa`mzP*xpQGi@ z@Jq$>Cw+W1y|xra4x!k$x>DrN_pY#SSdaI8YhkZ?jVSt|WhQwLwKYtcPvLA%IT!Nq z+u&|Z2IQ(YzlXwB+(9?4{Gk&-$0C|fW>j0I-_w*KQ^Srl^y=mqB^Oh>c1K!xYPty6 z9{DQ1%DPpeZ=xn|I$b1w1hkXj6T zc;y}o6f@uw4X!3fZG7&bH1B#`AbAUGOy?G#np|9RZk7(O@cGRiZmjnqcSI)sYJhii z>O=t3&<#Z9CpxvK(LnPOK^xBT^!@Ej`+HFV8~THN#0$m81rKnk^^PZ(oIs*2$CsVo zRhqO!z*p@RAFWtonk?gvfTY4~1S@h9C71&zpw}jI%s?(w6H2X@GbDQIh~g0#H?QyS z8-LAeXvNn;+=R1Dp>XoMh?on~*}^(=i=c_yMeiduzv(36Kb@q7x1?3ud&jh>p)9LZ zZn54hFisUm$?P<77(Ez31rKjV*vDwmc(K?S9{V-aZG-O|-=S_Kn7q2fUzL8Z5@DHO zqxeqAe?VHPloyYcI!w-47niepTSi+R)VX|(yV>>b+)n|`&`!!)gOyOc{P(B(H;o|) zXpeeM_23QZmxPTZo2BLc;J+D@dC1v`mnIR$kBPiEzt_)-#ZfpCQE?4h)4hT}`_xMd zcJF{ulx(6>X^@W<=5a_Y**<2_a~I{MWqYW_L*;7f0ashZ z5sxsPi=Do4+WVJQUzFc2ts1XshRmWU@z8q}zE zc;f4eKE6Ju6;JO&mRWr4B%aZy~bhY zN@g8fi^^>C43bKtn>;%-XQpBY%MmIXAs^Ee9ELp8J3V8;hXJ6aPVxGhxwmU`m&7DF z3N~rqs_zm5qu`_M`>F`b46axltk;IckoZ<516loN3{nO?mP4In>4W}+3=($uHnc!F zXS4qnESfMrO$J^HqpP?FsG1*BzjbJvq_#mX0Eb>QjZ&k8bt@lya zAOE&iLRk{vB=bYw#NsJLgU*vM3GbqjTUr&OZ)0&hF$3034 zVH_=5K^EoDcj(Zm$u*+t-r6|1M+=v8iC|0Cm)*IVk1^-|@Ts{4U3v{U`+i7UM!Cpq zf7>-CkWv9a28Z(ZIdF|uKar3DY3AZzT<73<%4O%DGmNiG+%^BFwNOY*LCQdgX>|&Q zVLVPvjj%1mb|LAAZEF^^K{vBa#F0aM( zenos_1;vH%+H6Fx&V8(&c{T8ul2dB9^sZZ{gTFW*IbzeI+YrL5C`*zPI9>4NB^nk! zKHFn=$M#h(;ZMP**j?~BZlU*1b&<=@_qP1(NniMrpfc?E);_qPfiga3{5p5$!lFjM zVhVjvgpQYm1SH~xh_9Mi9C)4}x58Xql0pl1hn3go+Zc8(9mUqe74fkZeX;S-qiSwR zk7j#y6!$}ACpXRcBtJW!BAHh^<-IY(>E9gpf`Y(vVwlxHW-tPwm_O_=_KYTD-#U4G zskQ9>53a2Ivxb#i;s+MBHw^5kjwDmaK2-?6fTaNMT(7(jn5*;?2iG~D9Ba$&pY4w* zl3#rq?`0f%PB2eaPG`=6^J6~-XDmrdiOdVHIcMqf+#(J0x3j+F032ZuMP zHfb}1^C)t+-1TM$ly8KfGA&#!U9kMVJ`+l|3FppmOeA}7$EhnWf_dnM9&G7lFJAXo z2c9#}_45eNao1^6o^2e)ya?ORSbFJ!7U`0;n{Q);q-NL{zh|5_Zzj9ISbytJ5*Ex_ z*SD}+*$XF7q7gMgSPw453TZOo<$^kTk-P=AOt+l+`a3|Pc73Xxsr`@IoVb!b14#}Z zLFym6qO-|wScrhTd>eF~KLdIPBA+iYIAy4q{Dhk|e?~-pr+}P!yy!-^YWhz%!2(H3 znY78A#0~ky6{b*6W1Gb37i&oXU^(ouE6f(_dpB=(QD}FWqZ5~@(AN<_P&bczbX(pK zwyVPYl;fGM$`*-RJsgMEJ_TUOri;>k<<$;HsX0EdG?;8xV}8Mo*DIA2U&0xeIB`&9 zJbj+N0PcnCS)ah+g6_RNG7XVV*jzi$0wEL2A8 zrnkI4-w`@GtL0#9X9o|PyuRw=uOFj-6Rp|2 zIU8jie@y3Kmhq3etp6RE)Y#bU>R^TTR3#+E8F}URPeA6`je&t zpM+V-DS*&`m$i4Qh4i`gcT?eAck4#f;qFmudge-gF5TGpa{Gc(J=(q5CeoU`k0 zC)SQ}U!<>q)3I(Ge3-iyyCAe23(Qq@`0%(w5A+r!!(Z#jdADev-;4Ub=dv$S>G<;u z{gGp*q(P~XGOSFF^9HPQa<^sY_tv%SWbhZVRHjMN@z>Si8%8zWWhWhIxvn{t{KCh- zM(#CBDMy(%yURsv7HuHbbOppr;`eXBo^VC2AmlG)<+t_IF7TdR2i+hieQCWbw$FQR zKdNH6F|4+tRQ6MHP>Hw-GeYK=8!JFN*gS5p2beNz_4;-sEF3>^iFK0r%v`APbXNi1 z;L%K_gKfbGV;yMo*`d2ikR~x~pf(TTMAwIaLy&05((SQl2SQ~P^ZXb@)F~$`8u04vKY5;o5hVtY}`e%J8_ccUcik|%*m+{3->i<7d}QN&|xP9 z1MKVt*fIwymf@Mtq(E@Nnqfp3Bk1c);4eg`c`;V*9P9}^tdrWgR;qTya-xB930VKe z46Gf``~i6FX`z?JV@Zr*%nM{UCh2rwj(o?0_DR@BtCiuum>XaiGkGM`DDt5s13ZE8*FuZ;(a*KR;f8j-P25ZsP>MO`{cEw8a zLTF|3!r8IFf(?7dT~t?94pP}gvNKp-rZY%h7k(fYA?K^FlrcZd;I46cvHIzI=F3#| z7^uqRUKFeJe4034-9;Ol)+>;A{;ATOL*;*nA@aek(rr@2RCbBY2pIg3g6Zrre0sQe z)C>hT5_;kF&kR=smVnAOtc_>+Pe$@9uNjX>WrzCYm}qv*LJRFjRRNR~@u1Y;!zN=S zlf(tuRWdcyZtkDy>u$>GtXUem9INt3^->R&uxx>{B5Om3ytLr*os+jD(>VZjl00b5 zJWtZUwzL+z`st}mlLOC`UHJ??n9C^IXR~w^qyRPZ9@JSK71A&YS-N0*Lhn?O!s;Q= z05#*P)4Dd+)ILT(rWSm#<@8@|v_Uu4UP#?Y#@S2QG9@KeT63K3=1Enh*n+`YoH*a_OAxY`Wnv15tlVbyOV9a{L~y9-9?g{>-HnFDru!Y*hY7D0aw42;s_7DrOy1ie{cu-YR<0?@M0(TvKG^kPl$M z$}=seBiJca!!4fhdsRthh&1Yz<2XLk5ce2;ewkJ}VrP}78p+uWv4(}1Rh1xe5jV|K zs|{a2s5U-|N5je4Q&VMr+&F~C5%fC;L{}fDdxaI)QXw{4cA#*_E2LqBWr6A?zhM2; z6>@+Ayrfj$zA9-T>Rq!Qd8p+&SqG_-c&-#{mL6&2ZK^&|pq6i7IKA?8_Gz&}bhTZx zeVj2(3TabGa@7zCmDZ-;z#x-?pB?_)WwsQl@+1R3TF_4DD2Ppt^>O`B!Liqb4bXW5 z%8+LTHRjExAp7FyA?zI2fy4~P{Z5`qoNKQ1t+ARL4Qeb%5Z{Pr+3WzZMDInE*Md_w z2lBYE`9hy(y%%X5Gm|BC#pkm>eU25-OhLgit45Yfmsdw@6JnA0jVonUq#ZSp4m|S- z!mgM02^4y2n?#gIw54t^u7`&tYpbNYR8f&8 zOG8drn`h}s{E7AleSdQSyfR^jYGd#0WG2acuw=QbXy(JF6K3jO;ngp@ZR6)?L(fp623XO#7O^~yVjoC$Z+V06Q@_Lv ziBEIhCRMJ>nlw%i)oy@(VBEsBlP1&X4>ofbKK0ccq#o#wC`*k+T4^F-yGr z+viETGQjWZ1|Ltm<31Yz^{LW@J&5YE(x_YfUTT@mK;NtV{pi>YcCC%B9Bb9{MYSu# zbnE+Jxb>RPvz7E6=RramBN2f)m26PwgxHhvj*1vXfq_C( z(U+c+eQOKRN%*8n=$?SaWk*K#RT?Hk9z-Ub?+9l10nnGn*q!%3Cf*AixR1VhO#JynzS?RK7sKbv#6^2?1Vfg;Oz-+7_vf&y`S=3!p9 zgH4^kYp9(c4fz`$C`00%pf`)w7P>j3SkdpTQE5Huj;O?*1R()>%tukZ*9H1Gu@07w z8M7Udj2=`E%cal>%Rfd8;|uaL& zlD16|E5}WZ5oh8$3m5U}HV@P%L(yTx*GL{_L=;3SD2?&JBxjqzVYd6z;F3E{7^mU( z3UflynBUU60v<^wt;a6;_60ro^JPKk<7B5oq~3lpB~XN>vgG$aPZfz~p<VFh=EzU&OZ@A$KLq@>cI6-DxCG+q&|K;{p_TQe3 z|FgD$@+X#|hhJO`&zi2m;?MrRBhu2pce!PERDK0gYw1F(6ZPDxVkks1*Go%7OMO*X zKI?vDWRve&-zZ6J`aQ`*CIm6JTR@h%XTuh)y*h3Z9ymTeO6nG=FDc-B&`KQf=*cwQ z-M8M^4vH%F=bkuKv7x^lON)Ll_kP`BaN);%lPkV;m{#v06X7~1At0MA&nf6eF8PE_ zoy#CuZ-$pxW!(W7s$~9R?pBq^u}eDxxAH=3hQrFU{xs(1b{3jCmpfKY4+hXg#>E=9 z#VY{A=!U8fB7fem_0!Lp-CN1|{aB`Yto7w?%ebx#8Hrsa@Y#u%#ZBd3}z4|vfc(!rwld!Vi@?_`d;wyf$z@hj7@i8P-uo#|m;*4j7G5%} z5Rv~F%mHh^thVwWsb`(7t*zg}R*f#51G_-zlr50Tip)QpvV{=-2Cu zXIB*iG+i`ZH+dsl1AtbBv(i-j(gh7H`31yIJ4tl-g@-Tsyj;s*f5Lh9xt^+4S4xepg zaD>PuQyX;vSv3t@_|ikgd(xJ1TR_oipQ-)3-)Eo-KWGhV(9CVS{C8jYzvreEQCpic zVLFi;tD}cKOwH?b1mr;f-DWrW^Y(K!K%4L8KMXs(25{6k=ir)DYt0`sw5>=6ecU*B z8K(Hd&QywQ7_8SbN3e8wsf_X+Yf1Odq0qJ0{-w@nEi)xtLh>7dF#y!NKtz&TKr-8E zsB{$?Urz%aWA__3GgRlVrCmv)#Ef&DRY+`_5J%%8# z3apWOkF=m$_0L=0d$iu*o`*<7UO_UpILG{Jn<`7``n(kmvuG(#`P+af@(v(m(!#XT zSFB#EekG5WL6EG!ZCbOKn4&N2mg06Vnzz;;oXN%nj5Xh)NU9y}DjhRVJySP3$0jbr z^zv~;wPWo-f%m6vEF=up04SyF`QUOe1X?@uc!3!_Qnyu{=XbdB^W5*xHQ14^`%~j= zhTWn5jPsUvMkcV=_&VKGDiBT6-%lk2+@&V>kh#r^mUr}Jcq{8o4;O!{IXJ2}9C?Q3 zQ!5%4JRIsNG3to7mkaS!M0EjFpJ;cu9?znC*`e~`T5;p23ZI5{^cB>EyQ0P6{(u5T zXtoMSiZRJHgbup&wkbT6*uKl#YBLmWxrb6q&5rAKiEd6yg$Se4qwAQx!4MmW?{0kV(gb-+~9=-|hWE)TFjzgmzQf%R%~; z_v4BM5c);*pWEuYtxYZsh3qMcO|PB@PB*Cai{n+VaClkc^!wHu42G!;t{Z+vo--23 zpBq(MO;kcdszji*KH5O|vB7l!rFP_L#z^%;CsiIb&z-=t zT@D3-2PQNn4x8KHr4<_)^}zwbE>k)8vL!TUF-qNGtbww?t193SAkVpPYb{Z3hmsfe zppJDXx(<>!4(AKmk0?Gq*|>q$+2iIOC_Z_kB+LIk;|%uw)M$WAK{*gs=^2T712ZJ@s>Hslqr1QRH_V$z z=uKUm-|-EgP8c4;V0m~-M^ITc40RadRrcuWYrmeL8~46Kh>A%WU0W^DJzwo!hrYsh z6y^2Tv?+7|T0$F?@GD9zbPf>o(X0Q~ge{{HwLOi27RS&WKER;61LZeZFl&WOv4<)V z?Dd+}G+!ZpUq^-6Tp1HXT(tMAXn}T2W$!}W*^m^x3yOS_hw#IpsYt1>jp>{gO>_L;n{%b676}-7K*Q>v1anSJY@5WGV*H!(xc(q3kFhrfAqQ6lG741KGAXQu_AoEaGu6)lYfQx!8%!ZwaUQ0}?>$t!;kD(xHMvSSU$7$y ztQ@tun9H2#vBCh-X5z6I^!(CZCxBLHsCrf^>i~LvT~ivJn0@(Pfufn5=64JI2lYlW zTQD!T{p7njfC_hzyo&%bC9j}Gl8XcNayu8$xH&+WSBZRdOHnLsrTfKSh0)%iulot9 zPqr1S1_j_`%>X#7z+f!@vGf#FyK28?l-zT_uF$!k7u#!jk%iuTvFpu_;>Cf^7Foc( zi^L=@G#)$MxdiKTFV6Vd=ab)S$mpNcJDm}$13!-E1s7B`P+sN+4R{Y0YlEyPzw{u% zosh&hCj>4x*4{=NB6h=P3;*JLCqJWwESvMmKio;P^nmrwNIgk;cgv#IQm{01sQhi7 zqdEwQ_ULSwu9e|n?}ANl3LybkL_ygb6p%G;5Qc8B-Pr&4f)(SmCqWT8cD zD!9jAakWEFU@8coxd@1t2SHKmO5_f_a|vbe-`P$-c+6J`=@v=QXXq>7YJVzRsYF)5 zULEAPgC1#04+Ll?K{4JbVs-~AX@$aPa>iMG6Y0bPyoW8SX^BUTJ#oA(+QjHLm+6}TKuvTDyE8J{ThZ(pSfU9<%^A9Ua{~TOs z!IdY+Js)Yruh~%?1XYLY=6an2>Ud`_>N7y(w*;&{YDnzMwhTqt&F1!C)w7uk_QE7e zL^&@j_9#lU`%iMiSj}u|x>*>M=w9pxQ_P%6Z(|Qstf7C*_LMV%^wIb1Q;2y`AXuiz z0v}g^G_Dpuvw-&xBzUZJr(1*29xgJtrRS}En9sEo7@-4o0D@e?p%d*|b7$i1E%W5f z95l`{gC$T7ulGP=)t)?_VJC2%AUT-ahp{#g>8iwa6etJHPbir%#XF+(4R2PHE9_EA z{Y|El%JW>fcvvM;JtUZoi-z{Ga3WXpdiAgu4)aNXb~x{ynoU!912Ov@Y}7uJZkDC# z5AL=!(6$5;nvMiphp`lj_XIhKB3>q3NSvS~TQTR= z%V53(T~xcVf`wEJT={D?W=bQ?w5-Rb(m#Ht21W$U-@ZQ>^Ww=zQM1BHywOt=Cv=l)iyh52DNd^y*%U7{N@Fv68B|ADl`T95&%G$&^f|e+cjhp*t zFV1rX3BvDo_(K-1f_^t24}8`L@Dzh0e+qEgLDSb6nvMU13oBG`Azj$)I$oyN5wGBw^tE%*=h2HNLHhwVd zrFRy!2pO6jkj`G|u7tOz^iZEs+uxl}Q&DhR#6)0}9E?+mxc~QMJO%0N$+lx2E5v_u z0zCnQ+L_X2@`{;Z))lK zZjbk&c3fBwaY|Yompn7k<+)6wn0xP3)1EbJ%Jn*$m{aO#fE9(=;zv5Q)g=?HUWfb_ zikWsXCOrE%HO`0ozdbTT~ zBhpsThB;pqRh6su=D-|BZsK1F=a?|lZz2~_cx>E82xC?FW#cn40Ww9qK%PH1Vl=9~n z$$UvKOC_|9Ob4|if?TmyRq9iiV69du=#A}6rEp5 z=aPJW7+$2uW>5C9dj%CdnRlE(gO!7+c7*D2<4L zF70S^Kg`&*pm6<+fnc4DN2>#Rd_)a^uAJ{@G@`G# zzLD5Sr%Y#hIWnB$OsC3C)ecJ+i*tj5b zO1{9?9?|}0#BpbooK2H^(62A1?K?lozmJSDcD%M{(?UNEnbFwHYLb!vH9566?HYi3 z0zU`tW0rRU+vt>2bK7>i{}Z;B`qtJ-jl7xqpl?S#uanU|6@W1)s0WVdj|BOnZ^i3vZx;Z_WoiWY1NNo;n>)5I#**xu zCH>FW1PxM!<}FEV_d5MC)r>_=wgL^u|;V03jQ8;b7PH|Fri3lxCQjgM)us@ z|9uy9&jwvjkm(%I=m9`V@3*0)I4}e-h}7KqB`L9r&zkVe#oZCrS=?W z?4i03jlgt4EWSH8+BAdP{VTJ(hN0X`^uoB+e$f|omrY+C56#E{)RwemCs@ z_GN=_|NF0enQ)m=z+H5srhQw$BB|ze;Af&u%*aq(-~-zBA*2N@vl(aS6uoQyi0U+)h;H7+N+i?~^5A(>r{d8as`5S9$QyC0|Pf0c~V^k0N*MkDLU8}O% zUh|gpB~}*)fOmtL7l3&*O)8cOaHD;IRd2#%1$H;&huq7)FY$*f0l00(g$f`q6#sm* zTz7baxYB);2>>Je7(Uu}uU+WBRfz$@;DyQG?#b*>7U0x!HhIwSWmjZ@;7uS}v;k*d z?y;Z7g8u-lbv$Pb(HMrU*|&WXMTcYlCmF!~-x=nAm}-H!5BPt*@Bg=KJ&f;zNP+^z zdbiiIsy!lIKlF6n-;yt|3|X7XfY(1r_nEUj52UrUJ3zLxeQMMj<>`Dzqocg?;?wKB zRK~%LxF>X*e=jwDULx%o7mIOW-LuLF9NGLvN16# zns$d+9CF9GaX_6ij)%!_v@^>g|muZ`hHJ2aVcQh3oUagow`m#Uf<~g!5 z(domPRbf3Rd+&a|@;6!@^!J8QdtCC>%CD4L&p|Kt_Da%Gv0 zBKCw;JOD^!Jr#}NkV5ADRwd$?vI=-w?0Ax-dW-hA6Ug`fD+{N|kuP2KCI^;YTaT#0 zHz}swG2QIF9Fl(2GKh6ccfwjA`b#&-HTNUwG<{F@gIyx;0V1sXagLM zke}*V7u+8KkpbLspNzTS|1xXAx{orD7& zTJ-xwaoTCzQm5%mXgHe8Yu4SosU5b|=bw50W^BEuiVEOue+O`s^$aOZ%w7Ev2;#;~ zpxHS!x!u#*p7sJrb^C|nx2OEp1ZWfh&kd7diwu^8&9(yB4?c60i=5Yfjw9L58KvNm=QRe$Gm5q#qq~-jHrE@U9Nsux={<`CVr&s8uHqF((1`X?>CF+m)7l?tK{i{8@gB z)QyoKfBMx`VKb36*bn2;-2%mXjzaB?M-gGbtn#9A!!!89KC%8F9ef)##-UmRdZizWeW26iu?r@ChY zd~0i19Yta{{8FQ?)v~) zeUPBDYUJjcCpVOQXgG{rIcT*#P-Og%fdZ&9P1}7uP*jwk0OU8L0$})?v_L>1jo~zM zhaG`_-);Z)ZpD;<-KHrb+LS4iC&T|HYD_;oY`te({M@1VzKyj-ziql0;N!n;KhC1u zuJ92G+XGjY{jZi4=#9RNeITmS7q&F{98=B&^3#n`4Soa=hCsuck_TrrA0G%##e>9S^y`&er$6f!@GgEM`K!^ zmTjG|uZ~!|&+dCyxcz(*el8ejc5%Fc)%m~7Q&JdFw@9_j$^Tk&+svMUxuTx7Y*p4G zvt$>^R`U6s6vC;f)@gxT|FbN}H)z(l90qbiy3$}@i=*eRD8U~maxG9m8ms)@Rmkke zu1M!<-+NnZohC0}DhXuj&o^Dy23r#>diO-tdsp~uqj{$F&IU>kUP8zWYZtkSLcnk zpO?ZOXDJ++?kRtX1I*}`B3AEiKTqXB@PVK!Bg^mJtTjHMsTM{gj66WdCz@R}>lvTd z0+z@r-p4^DUbIV-tNb$$rDS2CJdIhBsx0abr+^pmFUr3R{jG`Qhe;T*w%%&aeH7XT z$u0rpzH@z$-+9K3nqL3`*JZ7}3>yTV@&Y`h1o%mj{@VMe2HgXdn0_X$J-{4X0Y=ST zv#;PJ7v^@=czNS~e0Idb51=%WN3&m?rHSh@Nu_?T0BS_tZ=jpmssVVx>j5(X2LNKR zv)hlL*d-!)8wfx03ByO%fltNS6rRg--1p<;ue3;D;lEYdQl#*zJ`fx~fX}LZxKFO3 zxxNvn(G$dV)6M|t0`W~qmyG#;+WYRPrqXxah$0FqHbg~`0SiTn4MYf_2q-AMmw-w& z6ltM^B4n_DfT$pX6h)*-kP=El5fzapReAtHkQz|BfZX@PnKN^K_x#S9f9_fLu60>! z)+`~}+1dNsZ+V{QeLn2Z`F`}E&`M)Y*j_=DKQYb}YMZXN4ICFQ#mTh0h3IVc0I^#; znjPJ~2Xko0`~}qC1H{>qX(q=xIpSMVElR7wR;zagWd{cH-&c$lr0dNvY`NsTCFj#O zSX#D|-PvEURyfuYe}A1h%{QrFdsbH0+49I_Q(e)@&qY)qa*mmJ4n&Cdrh_%ft9PNz z<5T@%as>ZA&$Z~(f35kWeh1V7<3w~n{g3J4 ztJ34S50;lF{PIgd&!U%grM!2Ce1k#D$oySq-|eOkR5Es>t6yyMO(6~b6DkN{&H^oX zlR{0M+=@58=|#w99+|e1g+VSnjMK-04UQfxSme8a_gS13nS44Vf0@JjW>xt%w1{x) ze9v?E@4sJef8=(u0aGbJB5km+$8dAdAwUWIC7@{Ijzevzo$pOBb91FSoact6due@JmiVfBaz&;V zqch_4T>a1ez4Z&s>ibb`KlLON>IEF7d(IKRz#lJE7&O2mhpk2mDhl94#vo*&zwMG+ zS#rTQr-Y9Kdg1V{4*LJDNKw;k&@!;fGGh*s0!n7CDsh1mIAY+tMAqm`SX9OFm%@J!jRTGEfrDnVSr@3=73y~aN!B|QS~zl2LLHfHCp9v|0q zxW{j$m~Ck+jM&o`;lGs&eHY2L?T1=3_e@=&i_ZPPK2rvg7OiP0n zjN9qmHVGgQV&wcvLiQYgpQfM5Fk1Kmu06R(tOjAzG?*9j@PJ}RBd4-vcN9Bjp`(b; zBmB1*zs)$z`TQ#zmzvXBQ?(DrV2OF$uJvbTU$1310F6S1TI^|rm`O$_^R$uuO@(w) zwtAynjJX$dWmMVo!*90=!m)?dp4gcAEmI(V)H46>(23-dZ&VT1&lcH2+a{(XRCeN*@@3BA5dNE2pva%o67?`+|v*#XdE!k{(D^0qW=l z7%j~s{_#BA^AN2J0n0*apO!@Bu!0BcecB70u1>DzUU{LQgOg1zi#Xo}OB$5cU3lhz zZ-+vD7$z!(^lGES+|b#n+MDk=1ueT5CDyIQP!DpNxhVQmf&ly}*?a;tIJwMhy9y$) z=(q9>f|0M)`P~V_jpg0Gi}qX~{sKR%nn*FTlDD}4JxXp^1zC9A%D2x~!i4@6WZOlw z>H#5*EfG{+TeNR&;Wh!OBK({;&06P+OrOG>kUVJ|q;Jmbc%akg+@lypQ)=me=l%Vg zycUK0pN%W_Y9G(JV2MXn1m^iOSW4iXMO%a0-LM1wCI|n2$NFW`1@}MKj_rZ}ZAt8Xs@fxVFO1LVgz|JfO1W1O}ks{;q zD@$JqiJsRU*6mO`RR;OI&uBqSZrOEC$_Ll8x-QR~yr1}%Zc>=dHfU8BXOHR{g#04l zz(F$lE3C$A5TtTKDnlG;cYc&uE zf9c-?-F^lvd%b)I)46-I0E={jO8KRAUED0de})IPZ(M~gR)?KSJ*NiSnK@0&o08St z>qe&lTCoE}FN8T*Cw$#zgeyfVzOY9INOPFgKd zR-d&;4<7VWUEwV!qjC}fvw%(k{RG0}YzELw*9}p?ShCnKS2Ij$EjYhrH|e1SlF8_= zm4lwC)RUPCx3NQ4jGV{rhf*Q`Fqpg{-p_L#w5^p1OJ5Pq_=v&SI1Q1%?RYJ)-S zG4Qv`LWMD`dPk_PV@ASCENIw+CUrGrOkGTXMe(YiH|_aUjJimV!}usg@)BY zd6+hq9Xw7QCAUD%=!9u0`!MjyyF60=l*IN^MTi*>_+UO3h8WA91pG53PyK(3joEM| z3R&_i)Ijo>tP6t2d7a9qdgqz1dwuYRt}CFBxoJZZcoF3f*2d73bpaoGkELn&_x}r& zhXS|=Vov@ZNuAjp!eXb*=VINvrQonf0efuKodJ{#`ZYk^N$%hCEh@KjipL?X4)*y^ za4O0|Bn_;Lg0BeTlkY%R*dcUpZ&GgPkRjPb3SpN>(D(|X&2Lf0ugZftJCR=^g^pk!9D|Oskn#1Sxu-_hcDk&D1LIbHEBhnCMWjkHk}^ z%6|d-P;_s#kN26)iV)VPiiYNQIR}Hv)JOUvmE8gGr?O-fR9+bn2zd}5nZe={?%I&0 zV$kOo$FONnVFfXqwk$LhyLeqZYsY4Ip)X%C@xjQ-(nV-=$c2M@xqucX-c`tz_4!dS zx1$|LxqnlTSJYuZIu^A}Ph7;ZpAfjaMpdfU8x3>`K8WQ0p)kps%2&(F(VnPmG+Pat zd&~EOpEK8G*8EGWRdJF9)q-l&<#YkZK;g*OVi)gkTza7vJF=eOIhv9G5_k~c-Av_A z)`!GdpR%dvO-C+%YTm3%gCAqLmIB|Txs0BKX3d~IJ0!!-?FG>k|Mr2NH1v0M?lF6d z&Za@vRZgBLa-fwgDoyHJEnV-qS|*&s?}y(9r+$#;nicPIcqdQmz5~H0v`>Hfg*>Vr znr!M7@mlu|&_EhJd*pkX=2>>>F>zXdsPqtP^yO+J9!_|7)gzp3g!AFbUl7bV&RS(* zS1C*8J)y=;T3HTlJMtvpG*)Z$W1TX-$-TN(7W+M5!k}Jwo-MF!^_%m?-KF;@lkZTi z)gSQS6IS}^t(@4vah_6Ol{!HZJ{o6S0PU?gx&7{*<8}afBJ`$czf;poyQ_`mprz{v`*v~ zc~ouCo4iK#eJJr0rzM_i_DS3y(4m@;k2H?%|7MMm058<-lvLnSO^CIK0qBZS#~ zaAfW8-}frf(eI<(T(wki>ctIgR2C=|4%{RzX$+ndsJInCW$F3Zr$ih~yrd|B{r>R~ z?Ms%M#EhSDfEi>TfmlGJ9-ek(Uypg{ns%b-=kQF+_ijo{U);0D9a{|W>|y+2XRrMA z8Vl;B-gq&)SM72GJ8jhyKXKLI;(kh(?caVMZZ^M5=su}NNbpsH8g9)R5@b83+4Q~sS)feeE zeRN!#o)^ZD%+_KOEMV9I^Kg&$N`AShr(G<%mT*;{#W?smCDKoNU7bt^*h>!1NdED+ zv3Hkt-C5_iGmHJJ-HQQsjK&p2o>kqI3U&$5fBbE{Zcos$H5dU5oJ>N^2OT`r4+l5@ zgERZ5-{61db!I_-#%drY5^GV>cw|ox^!;{HQc|msDL5Na>CFXB z{eR(}2Sg0B-%;klywsK)lZ^U51L(Z$fEL?z=UhOcw9$VFYaaObbXZ*uEt%=f!qnGJ zpyct$_ZRiKdWlM#X(MUrCh|xvfB|ek>;i%T&%`t3*+2uG^GCXz+O!1Wv!AIH3~vFe z`Ag}rd8Z4E-n5Uk=a3GfiiX!Gn{Q!8_1%{DVf!7C*d4pFG}V%DhGfpr14VTf8ue(9 zy1G`uFwgEWSMvbyg@m4`WtYEzr{gPxR)H7ub;;91FflSV_*F6SmeB1@!42`_c8S)5 zrM1JeV*}{{`d0sK`b?YVtG#EXSRj~z|(2OB{ zZPb2dU#Rmp{}fv8Ug+VO{g_W&=Vz(3(gM$tm77EHAwVgFjq|c1GCsOy;_W>euYFZ} zJiXwcP0~-|L?4xA+}-_?YT9hXJsnIrJeY9sHl$?g@(+u+;%m_4E3^lTL;uF*Gd*Iz%i!gGe*M&+gZ9D+F7Q7E296$ zHPznk6lQiG1iE6a2aglPcexcmLB7x6M##XGdNl7%VZ?$t8cy6=n3?1>Vw)W2l?DnMhrb=pwl`vSytDK7jXMxyF#Dd%vF)}0+Hv5n-Pm2id^gY- zt=d~tB4st}Jr)KY(O6#N7oc>m0Oh;jC}egn!CWZ)JlBVy^SdVmgu)5i#&=5Kz>E{N z;W}n<+Z4ty5DJ@avJm$3Q`XMr;jGO+#z5XQ#Na3bOs1;{#GOV68~)?5*|4OQfKdyx zsQ(Z*!bn}5awr^#FNK!?w_Uc|0CB)Fj~sx1;w$a8=wm-gohx5cGTv(Pw7Y1p?+3A8&-liOa>t!I6};< z8$4ia(KIYtFLiNPd%z#dw7zi+=Ae82i&+0BT8u8u%d)GwkyIpryMIvCB%Kb9@`KcC zUPwBP=8M3S{7U%w15vc%H{QF@(Sg@|^!z{fUb;yt9H{&E0i?~7(0-)GdNu0nqQnP~ z4O`c>=9FzU?P;LS02Evi{MawYl^Os2qi=5JIs%IsFoFQ7JHmgy-*B6QkZKipUys4; zd=|1*S3Y`fv?)1^=m$DP>D-YEmu0!^K_TXFLgS&D^j9^m{E~Is&qfQDR}JtKi&#z# zjc03z+Rw-S7-+nW-vVI!g)m@W3bN#aSV7ma+7?;=tu3OTB7!gi`of$ur1gdXp9Hx7 z)QGAB|Gngm@Xh5<2@LU13ZFTFsRpBDJILD^ps_g`g-7`9Xk$_^+D(7}q|CeVkA{ET zxU?Zs`13!S4?Jk$asY1!ZA;#pKa~vDy?TGXbi>m9T!0;o0Ts4~*$r|819iJ-xK2+7 zd#@O}^|Jee5Q2D4oKE^K<1WhHy!})6)-%!^%JL? z@MUV@-d;vtqap^ccnz2=Ye8~5qBNu%t=VDQo}Hx1uOjUE7`6lWlws`NdDToq0_K$4 z0T7ji`=~Ddm7TiY2kiX`!1mEbgUDiDJMRsahgL|yQ1cv5Yr>?;A*>7(3yh8((ue>7 z0`$9iYR0fS|J~8ONpn+GUp1(LSPN@w&M2N|sY^|H_MHV2Vt^kzfIq&y=Mx?U3E<&6 z0l|K?PUSRAsl11h!UhB&d0<0ZhqWt(Qf?MT(%MDxyRt>Jb*vT<0A3NwJq`}+d5>^j zmDuR3Fj$R7iBUa6cNUOdWt(kwalH8XEzGYVR~w)`I7%#RhJY{A>TI%Rlq@=+iCU7B z0GrPF#*;~oJtbvN#)tPlcjZSUDzk8dO!3#$MtSC3xPI;RPZj7Vhv2lXxAK|>Z{2d| z1B+lF34Idb=3spn_L1XKn=_K?YC#s2^F^o z9`VRT;C0q`BK8k%SwNwmhHuh|<3L(Gst}|=;<1Zkc z4Euunzz3iV+n^GkW9c%%0@P!7FT$YiGVK8zZgCP|kilRj{hBVfJ!j_{Q z&?7!T7dLgEF@U9X4*3CpfVuKCtG$vl47#+3yKvHuq(@y$Ed*ODC^o~9EBm8J(3JAH z7dN<7AOc49*4p)(HVh`5s20Nta4RO?0$^K->N>o=a&xzdQFS#)n>eIk{-2*;>28}t zA>2kmXT+n-Y5q7E&2w(}LhD@!ue4Du_|p7<{;5tf^s4$pWxjtH|9;A60=P^ys-Pn1 zcAll+6&(D#04KN#n+I$fj2^{l0}_RB=tUxoG|$3IsU3}o9Prdi*Aqd6mWU~}8Or;e zHHvKmR?*~L(=fxE-?d3R3Qtzjr9;&*A_$1Mv$#9Hb_9@r1VTi{w08i|x~GUFuC3pT z;6Yg!K1EX5bZl4>StpoNEDyxbPP7w=3?FLV5Y~(ESZOfU*wPJm zq-93wc9LE%>IC3EIszu^C*ImMH$r_r=Lgf)=AtUc;Nhu4wba%3cb)b;0|iEbsP@RN z#FDWQ>S3hBtLEjHS46araS6~Yq8jJg6M`_@?)13?a-0h10{s?j4zZ8ecjqwTmhvEY zGP`UeP+OG(59n@x0%a6q!hqVNJz?fGoK*HCL+6#&-gG0%NM6+;(+ACTm52>qevsru zms4*LV7Cvdr3Muid~=A{X>3ZL)`RanZqltM1!j4UB!wfLIrCapR##(%q5+K`2k-L~ zp}UQ*KqhUCI!AP`q~WR52kG8BF12zcrJEAk4@Qyh-+j&gfpn|_1SlQM#5A|@p3>4! zfDL)LD+_g%LKRI8D>B+74CGu9=@rtkR4*Y$i3l38ai|k>q5WyGyqTsLFXQR_>Pw5I zZcZBU$Q9u&TOA-)(qLhCatuB=={BN69Ei~GO4x*0te9kfDV^Ke%(?oR12{_^U)P>^hO|AMN=ctTR|&kwmotGW$dbXd+0`HE;=b?8RXh`a z%@QOj!$wZ7;fn1~1MV9UKXIt8=>dZSNBBm0SkV28Ci;A@JZzfJm6V9~DADD09d1l| z&5*nSSo&KBaHCnQHv8GP-r*Zq>hbH}*8U8l9kYC}Cny_fPVJ9ehUHI6+A1N7O$j&( zH_Pqomj~*ER&L4{q;_%pOan;)ePzTa0e6JDCG}Cj`h}sEj2Mbd!d!p#y2gz`m~RqfV9qyG^m&+TU!{%M4fmTiJAPYOh7I# zk28+zv4yg(P{Rx|Zm$_z$b4JgeMO;+9^H~}h<%z#3HT5qrVnb+0A*aNCp_nCk< zz${b|ZNR~2n?@^CD)q7)Fh~2!(7!h?ANQ(X$LxQNLnkBknyMoV0*tQ@Jky>)Ygm4#S^K-8o<^W>m4CA<(Z_7H+3$dWaC$)-B%}NkWlV%DHqdR;NDlU(eknvVp25C{o@XnlKv9L>dyk zAwXwUVNab$o74a~!o__NfjP8*VoalIeh%?GJNXac77634HTpdrN&&Eb_I5s`X(w; zGu^Cc+b0b|N}qXiiYA#W-pK8HnueDCwA}KrI-{gyh^YuC%$;Vgw5DWQ^#oR+r*Zk6 zR|0Xm*tjYk=QE?L|Lu8&TnY;SoA^E0mNI=EMKO2C5y>hJ11!qe3P!v7SB3Jy*FYt2zsJ|iw69j zO6nswljc00E3_x*nbSLyS=WZ#OBU6E?W;&!15?0|WI+fb(ugb1hSx8F?@I$cG09&_ zLm~w;3qD!2pyxwn2|7DgkZrbUu+Fw5a#=$w%xUV(nHOhXAf31fI&srmC$eAOG->rQ z-&;V^oK-13*F+*qjkqP`CX~rMbRU-0U=9{~E__{Q;GvL!u4WQ3_r!-=Mhnd`iX^pR zoMy;9eoMFM-_h>3%e6Jx;wXaT81$CK=HD(y+e}JNE zKRCpAeSw%0Rud~N=^iCqla*wUB$0( zjr~`Fr{DCHD)Ib5Xwnj-q(*=|MIr5HShy7bI8c(bK05Vcshj z>f<&w*#(RIpNKm@{_)gXoY(Nd)xuuqvmXma!wU`BROKpck$Q(_#5ji*hGB252XqL| zqVh)2b-uEwoSv$aGX@6}x&(;cD>7DAmU~v(k*y5G&JsF^NAs*|qs?tG@`e!{l^jLu zF$otnN9l0$?4aEQR5Zk#{5V^wmIjqR$Tpo?psi>T>NNR}`4GjV12hd-AZuivuI z2FgX(%pKoAB2i&q0WM_7M1y@W8Z}Q~QZs&&%i$H$4MDF-0z%nxn+Tva?=;*i*KRrW z8pb*7kq&B*aehr7oSFmdBNfQD@8s|dJj{EJd}FCqwUAO5@q`(Wd^mJd38YYSMkpl6 z!+4ko!6)5~jE%KNauyhyu$>TkFW|dlUC%Cp45X1TlrvTr(Di(FjPQfXqahPF?cKaEJnMy~tSU0o5jLU<{TV zYl)LEvp({bsf9u&&ZBQ5=aYeCAd_3Nt}3@4^MI`ZR-qLT-sAyQ-OBfPzkUAt-@i41 zf8MGV3nI=(2Edc)y6KU|D=^C52z`FOz4S@J`KE~<!wdt1aQSNfaZo&Xa&hVhGO5@Lb>k)g*JIc!<3JM7j!V>GOa-dhc_sZO&(oP z=f5r#<&DafR}p6T#N?h9tnZJ_8NmUvAwz{wN*9_>0Y6QDdQhB)`3|hpY!oD0lvX-{ zm~%aol7|P1U>20f1s4{rK~R$o{w6gLT)#vzV;-8Up5!I|%hxbFLwtu1Q>+9F8hi+> zulwjHn5V6es0Koq#$FNDUM&hBGQZ1Z_UN$1GHGHTsjo2?YR*m}$Bd|@DPsjD zwZ)@2nG5z_(lI@SrhSFYO1vjO?r(mR+zdEaj)IMQt0en#fHwX@gH{IzD%<%m@^4U{WNS1IwF6T$=X7HAhT?9{f6~{9R(j zSsU}+KFT;cc3J+Iky2RdZ3b-Tn65_w?DgO$CRntdXwlwQI1|C3G~Ezn^}T|2{v#)i z26E!s9Io%6yhH**A2rguA)GX`zZXw0jGvH_YJh@~ZY|d8NhA(DvpsQ%Qe-R6A@bl6 zEGn9^luaS>1>RL)!z%G9ASLpgn`%gsxw=R&wN-=Nr**Cz^zkG_?mOhFZ{FQMoP*>c zTsnhLv7@4?aK)}d#xBUm1!&Mwrj z;v1d?F4iJft_CTNdeASyIGO0y5Fd>tP!jdU)h6#!qOhJtzB;vZ@I~RK?TVLMYVI}v*rQ+3-+uDBLDyZ literal 0 HcmV?d00001 diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..3c4d2ff 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 18; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,21 +64,21 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -137,14 +137,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..4d369dd 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -24,6 +25,17 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + if (idata[idx] > 0) { + bools[idx] = 1; + } + else { + bools[idx] = 0; + } } /** @@ -33,6 +45,14 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + if (bools[idx] == 1) { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..9d1cfca 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -10,6 +10,8 @@ #include #include +#define blockSize 512 + #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..372d212 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,13 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int sum = 0; + odata[0] = 0; + sum += idata[0]; + for (int i = 1; i < n; i++) { + odata[i] = sum; + sum += idata[i]; + } timer().endCpuTimer(); } @@ -31,8 +38,16 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int count = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[count] = idata[i]; + count++; + } + } + timer().endCpuTimer(); - return -1; + return count; } /** @@ -43,8 +58,35 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int* bools = (int *)malloc(n * sizeof(int)); + int* indices = (int*)malloc(n * sizeof(int)); + + for (int i = 0; i < n; i++) { + bools[i] = (idata[i] != 0) ? 1 : 0; + } + int sum = 0; + indices[0] = 0; + sum += bools[0]; + for (int i = 1; i < n; i++) { + indices[i] = sum; + sum += bools[i]; + } + memcpy(odata, indices, n * sizeof(int)); + int count = indices[n - 1]; + + for (int i = 0; i < n; i++) { + if (bools[i] == 1) { + odata[indices[i]] = idata[i]; + } + } + + free(bools); + free(indices); + + + timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..e445911 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,9 @@ #include #include "common.h" #include "efficient.h" +#include +#include + namespace StreamCompaction { namespace Efficient { @@ -15,10 +18,93 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ + + __global__ void kernUpsweepStep(int n, int destStride, int srcStride, int *data) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + int actualIdx = (idx + 1) * destStride - 1; + if (actualIdx >= n) { + return; + } + data[actualIdx] += data[actualIdx - srcStride]; + } + + __global__ void kernDownsweepStep(int n, int destStride, int srcStride, int* data) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + int actualIdx = (idx + 1) * destStride - 1; + if (actualIdx >= n) { + return; + } + int temp = data[actualIdx - srcStride]; + data[actualIdx - srcStride] = data[actualIdx]; + data[actualIdx] += temp; + } + + void scanWithoutTimer(int n, dim3 blocksPerGrid, int* dev_data) { + // TODO + + for (int d = 0; d <= ilog2ceil(n); d++) { + kernUpsweepStep << > > (n, std::pow(2, d + 1), std::pow(2, d), dev_data); + cudaDeviceSynchronize(); + } + + int zero = 0; + cudaMemcpy(dev_data + n - 1, &zero, sizeof(int), cudaMemcpyHostToDevice); + + for (int d = ilog2ceil(n); d >= 0; d--) { + kernDownsweepStep << > > (n, std::pow(2, d + 1), std::pow(2, d), dev_data); + cudaDeviceSynchronize(); + } + } + + int closestPower(int num) { + int i = 0; + while (num > std::pow(2, i)) { + i++; + } + return std::pow(2, i); + } + + int* zeros(int num) { + int *arr = (int*)malloc(num * sizeof(int)); + for (int i = 0; i < num; i++) { + arr[i] = 0; + } + return arr; + } + void scan(int n, int *odata, const int *idata) { + int nPot = closestPower(n); + + dim3 fullBlocksPerGrid((nPot + blockSize - 1) / blockSize); + + int* dev_data; + + cudaMalloc((void**)&dev_data, nPot * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_data"); + + cudaMemcpy(dev_data + nPot - n, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy idata ==> dev_data"); + + int* zero = zeros(n); + + cudaMemcpy(dev_data, zero, (nPot - n) * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy zero ==> dev_data"); + + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + scanWithoutTimer(nPot, fullBlocksPerGrid, dev_data); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data + nPot - n, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Error during cudaMemcpy odata"); + + cudaFree(dev_data); + checkCUDAError("Error during cudaFree dev_data"); + + free(zero); } /** @@ -31,10 +117,71 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int nPot = closestPower(n); + + dim3 fullBlocksPerGrid((nPot + blockSize - 1) / blockSize); + + int* dev_idata, * dev_bools, * dev_indices, int* dev_odata; + + cudaMalloc((void**)&dev_idata, nPot * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_idata"); + + cudaMalloc((void**)&dev_bools, nPot * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_bools"); + + cudaMalloc((void**)&dev_indices, nPot * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_indices"); + + cudaMalloc((void**)&dev_odata, nPot * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_odata"); + + cudaMemcpy(dev_idata + nPot - n, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy dev_data"); + + int* zero = zeros(n); + + cudaMemcpy(dev_idata, zero, (nPot - n) * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy zero ==> dev_data"); + + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + //// TODO + // + + StreamCompaction::Common::kernMapToBoolean << > > (nPot, dev_bools, dev_idata); + + cudaMemcpy(dev_indices, dev_bools, nPot * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("Error during cudaMemcpy dev_data"); + + scanWithoutTimer(nPot, fullBlocksPerGrid, dev_indices); + StreamCompaction::Common::kernScatter << > > (n, dev_odata + nPot - n, dev_idata + nPot - n, dev_bools + nPot - n, dev_indices + nPot - n); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata + nPot - n, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Error during cudaMemcpy dev_odata"); + + int count = 0; + int lastbool = 0; + cudaMemcpy(&count, dev_indices + nPot - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastbool, dev_bools + nPot - 1, sizeof(int), cudaMemcpyDeviceToHost); + + count += lastbool; + + cudaFree(dev_odata); + checkCUDAError("Error during cudaFree dev_odata"); + + cudaFree(dev_indices); + checkCUDAError("Error during cudaFree dev_indices"); + + cudaFree(dev_bools); + checkCUDAError("Error during cudaFree dev_bools"); + + cudaFree(dev_idata); + checkCUDAError("Error during cudaFree dev_idata"); + + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..2ec0f1d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,7 +1,11 @@ #include #include +#include #include "common.h" #include "naive.h" +#include +#include +#include namespace StreamCompaction { namespace Naive { @@ -12,14 +16,63 @@ namespace StreamCompaction { return timer; } // TODO: __global__ - + + __global__ void kernScanStep(int n, int stride, int* idata, int* odata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + odata[idx] = idata[idx]; + if (idx < stride) { + return; + } + odata[idx] += idata[idx - stride]; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int* dev_idata, * dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_idata"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_odata"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy dev_idata"); + + cudaDeviceSynchronize(); + // TODO + + timer().startGpuTimer(); + + for (int d = 0; d <= ilog2ceil(n); d++) { + kernScanStep << > > (n, std::pow(2, d), dev_idata, dev_odata); + cudaDeviceSynchronize(); + + cudaMemcpy(dev_idata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("Error during cudaMemcpy dev_odata ==> dev_idata"); + } + timer().endGpuTimer(); + + cudaMemcpy(odata + 1, dev_odata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Error during cudaMemcpy odata"); + + cudaFree(dev_idata); + checkCUDAError("Error during cudaFree dev_idata"); + cudaFree(dev_odata); + checkCUDAError("Error during cudaFree dev_odata"); + + + cudaDeviceSynchronize(); } + } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..0d34998 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,37 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + int* dev_idata, * dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_idata"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_odata"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy idata --> dev_idata"); + + thrust::device_ptr dev_thrust_idata = thrust::device_ptr(dev_idata); + thrust::device_ptr dev_thrust_odata = thrust::device_ptr(dev_odata); + timer().startGpuTimer(); + + + thrust::exclusive_scan(dev_thrust_idata, dev_thrust_idata + n, dev_thrust_odata); + + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Error during cudaMemcpy dev_odata --> odata"); + + cudaFree(dev_idata); + cudaFree(dev_odata); } } }