diff --git a/README.md b/README.md index 05005d6..adbf8f1 100644 --- a/README.md +++ b/README.md @@ -30,60 +30,60 @@ $ cargo run --release -- ffobench ## Benchmark result -- Date: 2024/02/21 +- Date: 2024/03/29 - Hardware: AMD Ryzen 9 7950X3D, DDR5-4800 64GB -- Environment: Linux 6.5.0-18-generic, Ubuntu 22.04.4, rustc 1.78.0-nightly +- Environment: Linux 6.5.0-26-generic, Ubuntu 22.04.4, rustc 1.78.0-nightly FFO 40-59 |No.|empties|result|answer|move|nodes|time|NPS| |---:|---:|---:|---:|---:|---:|:--:|---:| -|40|20|+38|+38|A2|90.4M| 0.054s|1644M/s| -|41|22| +0| +0|H4| 117M| 0.092s|1267M/s| -|42|22| +6| +6|G2| 287M| 0.177s|1617M/s| -|43|23|-12|-12|C7| 159M| 0.121s|1303M/s| -|44|23|-14|-14|D2| 111M| 0.083s|1322M/s| -|45|24| +6| +6|B2|1.57G| 1.010s|1561M/s| -|46|24| -8| -8|B3| 494M| 0.349s|1412M/s| -|47|25| +4| +4|G2| 167M| 0.123s|1350M/s| -|48|25|+28|+28|F6| 901M| 0.680s|1323M/s| -|49|26|+16|+16|E1|3.10G| 1.988s|1559M/s| -|50|26|+10|+10|D8|3.56G| 2.773s|1285M/s| -|51|27| +6| +6|E2|1.47G| 1.240s|1192M/s| -|52|27| +0| +0|A3|1.31G| 1.080s|1215M/s| -|53|28| -2| -2|D8|5.77G| 4.829s|1196M/s| -|54|28| -2| -2|C7|15.6G| 11.420s|1368M/s| -|55|29| +0| +0|G6|29.2G| 28.391s|1030M/s| -|56|29| +2| +2|H5|4.56G| 4.727s|965M/s| -|57|30|-10|-10|A6|19.8G| 18.593s|1066M/s| -|58|30| +4| +4|G1|4.83G| 4.961s|973M/s| -|59|34|+64|+64|G8|1.66k| 0.034s|0M/s| - -[Total] elapsed: 82738454us, node count: 93254297417, NPS: 1127097412nodes/sec +|40|20|+38|+38|A2|76.2M| 0.062s|1209M/s| +|41|22| +0| +0|H4|87.2M| 0.080s|1076M/s| +|42|22| +6| +6|G2| 270M| 0.178s|1511M/s| +|43|23|-12|-12|C7| 164M| 0.146s|1122M/s| +|44|23|-14|-14|D2|84.3M| 0.081s|1029M/s| +|45|24| +6| +6|B2|1.51G| 0.978s|1547M/s| +|46|24| -8| -8|B3| 443M| 0.329s|1343M/s| +|47|25| +4| +4|G2| 133M| 0.122s|1084M/s| +|48|25|+28|+28|F6| 874M| 0.676s|1291M/s| +|49|26|+16|+16|E1|3.14G| 2.047s|1536M/s| +|50|26|+10|+10|D8|3.24G| 2.543s|1276M/s| +|51|27| +6| +6|E2|1.41G| 1.188s|1191M/s| +|52|27| +0| +0|A3|1.42G| 1.175s|1208M/s| +|53|28| -2| -2|D8|4.91G| 4.038s|1216M/s| +|54|28| -2| -2|C7|13.6G| 9.923s|1375M/s| +|55|29| +0| +0|G6|27.3G| 25.529s|1070M/s| +|56|29| +2| +2|H5|4.26G| 4.629s|921M/s| +|57|30|-10|-10|A6|19.6G| 18.350s|1071M/s| +|58|30| +4| +4|G1|4.48G| 4.760s|942M/s| +|59|34|+64|+64|G8|1.26k| 0.029s|0M/s| + +[Total] elapsed: 76872921us, node count: 87193095283, NPS: 1134249800nodes/sec FFO 60-79 |No.|empties|result|answer|move|nodes|time|NPS| |---:|---:|---:|---:|---:|---:|:--:|---:| -|60|24|+20|+20|C2| 216M| 0.164s|1313M/s| -|61|25|-14|-14|G1| 339M| 0.318s|1062M/s| -|62|27|+28|+28|E8|8.51G| 7.369s|1154M/s| -|63|27| -2| -2|F2|2.88G| 2.393s|1203M/s| -|64|27|+20|+20|B4|11.2G| 9.350s|1207M/s| -|65|28|+10|+10|G1|29.2G| 20.527s|1422M/s| -|66|28|+30|+30|H3|21.1G| 16.079s|1314M/s| -|67|28|+22|+22|H3|28.2G| 20.363s|1386M/s| -|68|30|+28|+28|E8| 139G| 107.049s|1301M/s| -|69|30| +0| +0|H3|15.7G| 14.081s|1121M/s| -|70|30|-24|-24|E3|14.4G| 13.097s|1106M/s| -|71|31|+20|+20|D2|20.1G| 19.281s|1044M/s| -|72|31|+24|+24|E1| 258G| 298.006s|867M/s| -|73|31| -4| -4|G4|29.4G| 34.672s|849M/s| -|74|31|-30|-30|F1| 611G| 563.534s|1084M/s| -|75|32|+14|+14|D2| 299G| 225.346s|1327M/s| -|76|32|+32|+32|A3|2.12T|1869.331s|1134M/s| -|77|34|+34|+34|B7|1.13T|1015.754s|1121M/s| -|78|34| +8| +8|F1| 673G| 844.518s|797M/s| -|79|36|+64|+64|D7|56.8G| 43.134s|1319M/s| - -[Total] elapsed: 5124378900us, node count: 5480838886501, NPS: 1069561598nodes/sec +|60|24|+20|+20|C2| 213M| 0.175s|1213M/s| +|61|25|-14|-14|G1| 344M| 0.325s|1058M/s| +|62|27|+28|+28|E8|7.61G| 6.587s|1155M/s| +|63|27| -2| -2|F2|2.89G| 2.383s|1215M/s| +|64|27|+20|+20|B4|10.4G| 8.675s|1204M/s| +|65|28|+10|+10|G1|26.6G| 18.953s|1405M/s| +|66|28|+30|+30|H3|19.7G| 15.215s|1299M/s| +|67|28|+22|+22|H3|25.8G| 18.823s|1375M/s| +|68|30|+28|+28|E8| 116G| 90.898s|1287M/s| +|69|30| +0| +0|H3|14.5G| 13.385s|1086M/s| +|70|30|-24|-24|E3|13.0G| 11.413s|1146M/s| +|71|31|+20|+20|D2|21.3G| 21.350s|999M/s| +|72|31|+24|+24|E1| 178G| 197.643s|902M/s| +|73|31| -4| -4|G4|25.8G| 31.323s|824M/s| +|74|31|-30|-30|F1| 599G| 580.678s|1031M/s| +|75|32|+14|+14|D2| 234G| 180.377s|1300M/s| +|76|32|+32|+32|A3|1.56T|1441.196s|1087M/s| +|77|34|+34|+34|B7|1.23T|1122.816s|1100M/s| +|78|34| +8| +8|F1| 571G| 831.075s|687M/s| +|79|36|+64|+64|D7|14.6G| 12.269s|1191M/s| + +[Total] elapsed: 4605575684us, node count: 4685891940872, NPS: 1017438918nodes/sec diff --git a/src/book.rs b/src/book.rs index 9e27545..6dd954b 100644 --- a/src/book.rs +++ b/src/book.rs @@ -178,7 +178,7 @@ fn search( solve_obj.cache_gen += 1; if board.empty().count_ones() <= 18 { let mut solve_obj = solve_obj.clone(); - solve_with_move(board, &mut solve_obj, &sub_solver.clone()) + solve_with_move(board, &mut solve_obj, &sub_solver.clone(), None) } else { let start = Instant::now(); let timer = Timer { diff --git a/src/engine/bits.rs b/src/engine/bits.rs index c92f3c7..a25ab54 100644 --- a/src/engine/bits.rs +++ b/src/engine/bits.rs @@ -7,12 +7,12 @@ pub trait BitManip { } impl BitManip for u64 { - #[cfg(target_feature = "avx2")] + #[cfg(all(target_feature = "bmi2", not(slow_pext)))] fn pext(&self, mask: u64) -> u64 { unsafe { _pext_u64(*self, mask) } } - #[cfg(not(target_feature = "avx2"))] + #[cfg(not(all(target_feature = "bmi2", not(slow_pext))))] fn pext(&self, mut mask: u64) -> u64 { let mut x = *self; x = x & mask; diff --git a/src/engine/board.rs b/src/engine/board.rs index 5585e6b..ac4bdf2 100644 --- a/src/engine/board.rs +++ b/src/engine/board.rs @@ -1,11 +1,11 @@ #[cfg(test)] mod test; -#[cfg(target_feature = "neon")] -use std::arch::aarch64::*; use crate::engine::bits::*; use crate::engine::hand::*; use anyhow::Result; use clap::ArgMatches; +#[cfg(target_feature = "neon")] +use std::arch::aarch64::*; use std::cmp::min; use std::fmt; use std::io::{BufWriter, Write}; @@ -55,7 +55,10 @@ fn smart_upper_bit(x: u64x4) -> u64x4 { } } -#[cfg(not(any(target_feature = "neon", all(target_feature = "avx512cd", target_feature = "avx512vl"))))] +#[cfg(not(any( + target_feature = "neon", + all(target_feature = "avx512cd", target_feature = "avx512vl") +)))] fn smart_upper_bit(mut x: u64x4) -> u64x4 { x |= x >> u64x4::from_array([8, 1, 7, 9]); x |= x >> u64x4::from_array([16, 2, 14, 18]); diff --git a/src/engine/endgame.rs b/src/engine/endgame.rs index d0448a1..abccaa6 100644 --- a/src/engine/endgame.rs +++ b/src/engine/endgame.rs @@ -7,7 +7,9 @@ use crate::engine::hand::*; use crate::engine::search::*; use crate::engine::table::*; use arrayvec::ArrayVec; +use crc64::Crc64; use std::cmp::max; +use std::io::Write; fn near_leaf(solve_obj: &mut SolveObj, board: Board) -> (i8, SolveStat) { let (score, node_count) = solve_obj.last_cache.solve_last(board); @@ -211,8 +213,30 @@ pub fn solve_inner( CutType::LessThanAlpha(v) => return (v, SolveStat::one_stcut()), } } - if rem < solve_obj.params.res_cache_limit { + if rem < solve_obj.params.local_res_cache_limit { fastest_first(solve_obj, board, (alpha, beta), passed) + } else if rem < solve_obj.params.res_cache_limit { + let mut crc64 = Crc64::new(); + crc64.write(&board.player.to_le_bytes()).unwrap(); + crc64.write(&board.opponent.to_le_bytes()).unwrap(); + let hash = crc64.get(); + let res_cache = solve_obj.local_res_cache.get(board, hash); + let lookup_result = make_lookup_result(res_cache, (&mut alpha, &mut beta)); + let (lower, upper) = match lookup_result { + CacheLookupResult::Cut(v) => return (v, SolveStat::zero()), + CacheLookupResult::NoCut(l, u, _) => (l, u), + }; + let (res, stat) = fastest_first(solve_obj, board, (alpha, beta), passed); + let record = make_record( + solve_obj.local_cache_gen, + board, + res, + None, + (alpha, beta), + (lower, upper), + ); + solve_obj.local_res_cache.update(&record, hash); + (res, stat) } else if rem < solve_obj.params.eval_ordering_limit { let (lower, upper) = match lookup_table(solve_obj, board, (&mut alpha, &mut beta)) { CacheLookupResult::Cut(v) => return (v, SolveStat::zero()), diff --git a/src/engine/last_cache.rs b/src/engine/last_cache.rs index 7f5afa0..f74c8c9 100644 --- a/src/engine/last_cache.rs +++ b/src/engine/last_cache.rs @@ -63,23 +63,23 @@ impl LastCache { } } - #[cfg(target_feature = "bmi2")] + #[cfg(all(target_feature = "bmi2", not(slow_pext)))] fn get_col_bits(bits: u64, mask: u64, _col: usize) -> u64 { bits.pext(mask) } - #[cfg(not(target_feature = "bmi2"))] + #[cfg(not(all(target_feature = "bmi2", not(slow_pext))))] fn get_col_bits(mut bits: u64, mask: u64, col: usize) -> u64 { bits &= mask; ((bits >> col).wrapping_mul(0x0002_0408_1020_4081) >> 49) & 0xff } - #[cfg(target_feature = "bmi2")] + #[cfg(all(target_feature = "bmi2", not(slow_pext)))] fn get_diag1_bits(bits: u64, mask: u64, _row: usize, _col: usize) -> u64 { bits.pext(mask) } - #[cfg(not(target_feature = "bmi2"))] + #[cfg(not(all(target_feature = "bmi2", not(slow_pext))))] fn get_diag1_bits(mut bits: u64, mask: u64, row: usize, col: usize) -> u64 { bits &= mask; let width = if row >= col { @@ -92,12 +92,12 @@ impl LastCache { (bits.wrapping_mul(0x0101_0101_0101_0101) >> 56) & ((1 << width) - 1) } - #[cfg(target_feature = "bmi2")] + #[cfg(all(target_feature = "bmi2", not(slow_pext)))] fn get_diag2_bits(bits: u64, mask: u64, _row: usize, _col: usize) -> u64 { bits.pext(mask) } - #[cfg(not(target_feature = "bmi2"))] + #[cfg(not(all(target_feature = "bmi2", not(slow_pext))))] fn get_diag2_bits(mut bits: u64, mask: u64, row: usize, col: usize) -> u64 { bits &= mask; let width = if row + col >= 7 { diff --git a/src/engine/midgame.rs b/src/engine/midgame.rs index 22ed98c..c2c5124 100644 --- a/src/engine/midgame.rs +++ b/src/engine/midgame.rs @@ -100,6 +100,7 @@ fn simplified_abdada_intro( if depth >= ctx.solve_obj.params.parallel_depth_limit || rem < ctx.solve_obj.params.parallel_empties_limit { let (res, stat) = solve_inner(&mut ctx.solve_obj, board, (alpha, beta), passed); ctx.stats.merge(stat); + ctx.solve_obj.local_cache_gen += 1; return Some((res, None)); } ctx.stats.merge(SolveStat::one()); @@ -148,12 +149,14 @@ pub fn simplified_abdada( (alpha, beta): (i8, i8), passed: bool, depth: i8, + num_threads: Option, ) -> (i8, Option, SolveStat) { thread::scope(|s| { let mut handles = Vec::new(); let cs_hash = Arc::new(DashSet::new()); let finished = Arc::new(AtomicBool::new(false)); - for _ in 0..num_cpus::get() { + let num_threads = num_threads.unwrap_or(num_cpus::get()); + for _ in 0..num_threads { let solve_obj = solve_obj.clone(); let cs_hash = cs_hash.clone(); let finished = finished.clone(); diff --git a/src/engine/search.rs b/src/engine/search.rs index 331b9e4..23528e5 100644 --- a/src/engine/search.rs +++ b/src/engine/search.rs @@ -40,6 +40,7 @@ pub struct SearchParams { pub parallel_empties_limit: i8, pub eval_ordering_limit: i8, pub res_cache_limit: i8, + pub local_res_cache_limit: i8, pub stability_cut_limit: i8, pub ffs_ordering_limit: i8, pub static_ordering_limit: i8, @@ -48,10 +49,12 @@ pub struct SearchParams { pub struct SolveObj { pub res_cache: Arc, pub eval_cache: Arc, + pub local_res_cache: CacheArray, pub evaluator: Arc, pub last_cache: Arc, pub params: SearchParams, pub cache_gen: u32, + pub local_cache_gen: u32, } impl Clone for SolveObj { @@ -59,10 +62,12 @@ impl Clone for SolveObj { SolveObj:: { res_cache: self.res_cache.clone(), eval_cache: self.eval_cache.clone(), + local_res_cache: self.local_res_cache.clone(), evaluator: self.evaluator.clone(), last_cache: self.last_cache.clone(), params: self.params.clone(), cache_gen: self.cache_gen.clone(), + local_cache_gen: self.local_cache_gen.clone(), } } } @@ -78,10 +83,12 @@ impl SolveObj { SolveObj { res_cache, eval_cache, + local_res_cache: CacheArray::::new(65536), evaluator, last_cache: Arc::new(LastCache::new()), params, cache_gen, + local_cache_gen: 0, } } } @@ -294,14 +301,17 @@ pub fn solve( (alpha, beta): (i8, i8), passed: bool, depth: i8, + num_threads: Option, ) -> (i8, Option, SolveStat) { - simplified_abdada(solve_obj, board, (alpha, beta), passed, depth) + simplified_abdada(solve_obj, board, (alpha, beta), passed, depth, num_threads) } +// num_threads: number of searching threads, use number of cpus when None pub fn solve_with_move( board: Board, solve_obj: &mut SolveObj, _sub_solver: &Arc, + num_threads: Option, ) -> Hand { if let Some(best) = simplified_abdada( solve_obj, @@ -309,6 +319,7 @@ pub fn solve_with_move( (-(BOARD_SIZE as i8), BOARD_SIZE as i8), false, 0, + num_threads, ) .1 { @@ -318,7 +329,15 @@ pub fn solve_with_move( let mut result = -65; for pos in board.mobility() { let next = board.play(pos).unwrap(); - let res = -simplified_abdada(solve_obj, next, (-(BOARD_SIZE as i8), -result), false, 0).0; + let res = -simplified_abdada( + solve_obj, + next, + (-(BOARD_SIZE as i8), -result), + false, + 0, + num_threads, + ) + .0; if res > result { result = res; best_pos = Some(pos); diff --git a/src/engine/table.rs b/src/engine/table.rs index 835c9ec..619357c 100644 --- a/src/engine/table.rs +++ b/src/engine/table.rs @@ -124,13 +124,14 @@ impl CacheElement for ResCache { } } -struct CacheArray { +#[derive(Clone)] +pub struct CacheArray { ary: Vec, cycle: u64, } impl CacheArray { - fn new(size: usize) -> CacheArray { + pub fn new(size: usize) -> CacheArray { let dummy: T = Default::default(); CacheArray { ary: vec![dummy; size], @@ -138,7 +139,7 @@ impl CacheArray { } } - fn get(&self, board: Board, hash: u64) -> Option { + pub fn get(&self, board: Board, hash: u64) -> Option { let index = (hash % self.cycle) as usize; let elem = &self.ary[index]; if elem.has_key(board) { @@ -148,7 +149,7 @@ impl CacheArray { } } - fn update(&mut self, new_elem: &T, hash: u64) { + pub fn update(&mut self, new_elem: &T, hash: u64) { let index = (hash % self.cycle) as usize; let elem = &mut self.ary[index]; elem.update(new_elem); @@ -197,7 +198,7 @@ impl CacheTable { pub type EvalCacheTable = CacheTable; pub type ResCacheTable = CacheTable; -fn make_record( +pub fn make_record( gen: u32, board: Board, mut res: i8, diff --git a/src/main.rs b/src/main.rs index 1750971..e6c5c9e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -94,6 +94,7 @@ fn solve_ffo( (-(BOARD_SIZE as i8), BOARD_SIZE as i8), false, 0, + None, ); let end = start.elapsed(); let milli_seconds = end.as_millis() + 1; // ceil up, avoid zero-division diff --git a/src/play.rs b/src/play.rs index 795151d..8db72b9 100644 --- a/src/play.rs +++ b/src/play.rs @@ -71,7 +71,7 @@ pub fn play(matches: &ArgMatches) -> Board { best } else { let mut solve_obj = solve_obj.clone(); - solve_with_move(board.board, &mut solve_obj, &sub_solver) + solve_with_move(board.board, &mut solve_obj, &sub_solver, None) }; solve_obj.cache_gen += 1; best @@ -133,7 +133,7 @@ pub fn self_play(matches: &ArgMatches) -> Board { best } else { let mut solve_obj = solve_obj.clone(); - solve_with_move(board.board, &mut solve_obj, &sub_solver) + solve_with_move(board.board, &mut solve_obj, &sub_solver, None) }; solve_obj.cache_gen += 1; let hand = best; @@ -196,7 +196,7 @@ fn self_play_worker( best } else { let mut obj = solve_obj.clone(); - solve_with_move(board.board, &mut obj, &sub_solver) + solve_with_move(board.board, &mut obj, &sub_solver, Some(1)) }; solve_obj.cache_gen += 1; let hand = best; @@ -286,6 +286,7 @@ pub fn codingame(_matches: &ArgMatches) -> Result<(), Box parallel_empties_limit: 16, eval_ordering_limit: 15, res_cache_limit: 12, + local_res_cache_limit: 9, stability_cut_limit: 8, ffs_ordering_limit: 6, static_ordering_limit: 5, @@ -371,7 +372,7 @@ pub fn codingame(_matches: &ArgMatches) -> Result<(), Box best } else { let mut solve_obj = solve_obj.clone(); - solve_with_move(board.board, &mut solve_obj, &sub_solver) + solve_with_move(board.board, &mut solve_obj, &sub_solver, None) }; solve_obj.cache_gen += 1; match best { diff --git a/src/record.rs b/src/record.rs index 90a0328..cabdc66 100644 --- a/src/record.rs +++ b/src/record.rs @@ -28,7 +28,7 @@ impl Record { for i in 0..(l / 2) { let h = Hand::from_str(&record_str[(2 * i)..(2 * i + 2)])?; hands.push(h); - board = board.play_hand(h).ok_or(UnmovableError{})?; + board = board.play_hand(h).ok_or(UnmovableError {})?; } let score = if let Some(score) = splitted.get(1) { score.parse().unwrap() @@ -52,7 +52,7 @@ impl Record { }; for &h in &self.hands { res.push((board, h, score)); - board = board.play_hand(h).ok_or(UnmovableError{})?; + board = board.play_hand(h).ok_or(UnmovableError {})?; score = -score; } res.push((board, Hand::Pass, score)); diff --git a/src/remote.rs b/src/remote.rs index be15b05..627a727 100644 --- a/src/remote.rs +++ b/src/remote.rs @@ -51,6 +51,7 @@ async fn worker_body() -> Result<(), Box> { parallel_empties_limit: 16, eval_ordering_limit: 15, res_cache_limit: 12, + local_res_cache_limit: 9, stability_cut_limit: 8, ffs_ordering_limit: 6, static_ordering_limit: 5, diff --git a/src/setup.rs b/src/setup.rs index 4ad7338..b664d74 100644 --- a/src/setup.rs +++ b/src/setup.rs @@ -13,7 +13,8 @@ pub fn setup_default() -> SolveObj { parallel_depth_limit: 16, parallel_empties_limit: 16, eval_ordering_limit: 15, - res_cache_limit: 12, + res_cache_limit: 14, + local_res_cache_limit: 10, stability_cut_limit: 8, ffs_ordering_limit: 6, static_ordering_limit: 5,