Add logit bonus system
ZealanL committed Oct 26, 2024
1 parent 628f6a3 commit a8d28f0
Showing 4 changed files with 27 additions and 2 deletions.
11 changes: 10 additions & 1 deletion RLGymPPO_CPP/src/private/RLGymPPO_CPP/PPO/DiscretePolicy.h
@@ -13,7 +13,9 @@ namespace RLGPC {
 		int inputAmount;
 		int actionAmount;
 		IList layerSizes;
+
 		float temperature;
+		torch::Tensor logitBonuses;
 
 		// Min probability that an action will be taken
 		constexpr static float ACTION_MIN_PROB = 1e-11;
@@ -25,8 +27,15 @@ namespace RLGPC {
 		void CopyTo(DiscretePolicy& to);
 
 		torch::Tensor GetOutput(torch::Tensor input) {
+			auto baseOutput = seq->forward(input) / temperature;
+
+			if (logitBonuses.defined()) {
+				auto outputRange = baseOutput.max() - baseOutput.min();
+				baseOutput = baseOutput + (logitBonuses * outputRange);
+			}
+
 			return torch::nn::functional::softmax(
-				seq->forward(input) / temperature,
+				baseOutput,
 				torch::nn::functional::SoftmaxFuncOptions(-1)
 			);
 		}
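The new GetOutput scales each bonus by the spread of the raw logits (max − min) before adding it, so a bonus of 0.5 shifts an action's logit by half the current logit range regardless of how large the network's outputs are. A minimal standalone sketch of that arithmetic, with made-up logits and bonuses (example values are illustrative, not from the commit):

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
	// Raw logits for 4 discrete actions (arbitrary example values).
	torch::Tensor logits = torch::tensor({2.0f, 0.5f, -1.0f, 1.0f});

	// Bonuses as fractions of the current logit range, mirroring GetOutput():
	// +0.5 favors action 0, -0.5 suppresses action 2.
	torch::Tensor logitBonuses = torch::tensor({0.5f, 0.0f, -0.5f, 0.0f});

	torch::Tensor outputRange = logits.max() - logits.min(); // = 3.0 here
	torch::Tensor biased = logits + logitBonuses * outputRange;

	std::cout << torch::softmax(logits, -1) << "\n"; // without bonuses
	std::cout << torch::softmax(biased, -1) << "\n"; // with bonuses
}
```

Because the bonus tracks the logit range, its influence on the softmax stays roughly constant as training changes the magnitude of the logits; a fixed additive constant would instead fade as the logits grow.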
8 changes: 7 additions & 1 deletion
@@ -79,9 +79,15 @@ void _RunFunc(ThreadAgent* ta) {
 		// Infer the policy to get actions for all our agents in all our games
 		Timer policyInferTimer = {};
 
+
 		if (blockConcurrentInfer)
 			mgr->inferMutex.lock();
-		auto actionResults = policy->GetAction(curObsTensorDevice, deterministic);
+		RLGPC::DiscretePolicy::ActionResult actionResults;
+		try {
+			actionResults = policy->GetAction(curObsTensorDevice, deterministic);
+		} catch (std::exception& e) {
+			RG_ERR_CLOSE("Exception during policy->GetAction(): " << e.what());
+		}
 		if (blockConcurrentInfer)
 			mgr->inferMutex.unlock();
 		if (halfPrec) {
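One side effect worth noting: GetAction can now exit via an exception, and the surrounding lock/unlock pair is manual, so the mutex would stay locked if RG_ERR_CLOSE ever returned instead of terminating the process. A scoped lock sidesteps that; a hedged sketch of the pattern (the names mirror the diff above, but this is not code from the repository):

```cpp
#include <functional>
#include <mutex>

std::mutex inferMutex;            // stands in for mgr->inferMutex
bool blockConcurrentInfer = true; // same flag as in the diff

void RunGuardedInference(const std::function<void()>& infer) {
	// std::unique_lock releases the mutex in its destructor,
	// even if infer() throws, so no manual unlock is needed.
	std::unique_lock<std::mutex> lock(inferMutex, std::defer_lock);
	if (blockConcurrentInfer)
		lock.lock();
	infer(); // e.g. policy->GetAction(...)
}
```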
8 changes: 8 additions & 0 deletions RLGymPPO_CPP/src/public/RLGymPPO_CPP/Learner.cpp
@@ -1,4 +1,5 @@
 #include "Learner.h"
+#include "Learner.h"
 
 #include "../../private/RLGymPPO_CPP/Util/SkillTracker.h"
 
@@ -706,6 +707,13 @@ void RLGPC::Learner::UpdateLearningRates(float policyLR, float criticLR) {
 	ppo->UpdateLearningRates(policyLR, criticLR);
 }
 
+void RLGPC::Learner::SetLogitBonuses(RLGSC::FList bonuses) {
+	RG_ASSERT(bonuses.size() == actionAmount);
+	ppo->policy->logitBonuses = torch::tensor(bonuses, ppo->policy->device);
+	if (ppo->policyHalf)
+		ppo->policyHalf->logitBonuses = torch::tensor(bonuses, ppo->policy->device);
+}
+
 std::vector<RLGPC::Report> RLGPC::Learner::GetAllGameMetrics() {
 	std::vector<Report> reports = {};
 
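For callers, SetLogitBonuses takes one float per discrete action, and the assert guards against a mismatched list; the bonuses are copied to both the full-precision policy and, if present, the half-precision copy, so inference in either mode sees the same bias. A hedged usage sketch (the `learner` variable and the action count of 8 are assumptions for illustration, not from the commit):

```cpp
// Assumed: a configured RLGPC::Learner named `learner` whose action
// parser exposes 8 discrete actions (hypothetical count), and that
// RLGSC::FList behaves like std::vector<float>.
RLGSC::FList bonuses(8, 0.0f); // 0 = no bias
bonuses[0] = 0.25f;            // raise action 0's logit by 25% of the logit range
bonuses[7] = -0.25f;           // suppress action 7 by the same amount
learner.SetLogitBonuses(bonuses);
```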
2 changes: 2 additions & 0 deletions RLGymPPO_CPP/src/public/RLGymPPO_CPP/Learner.h
@@ -41,6 +41,8 @@ namespace RLGPC {
 
 		void UpdateLearningRates(float policyLR, float criticLR);
 
+		void SetLogitBonuses(RLGSC::FList bonuses);
+
 		std::vector<Report> GetAllGameMetrics();
 
 		void Save();
