From 78559a8911df57252a2fc099255b372108bceda0 Mon Sep 17 00:00:00 2001
From: Nicola Gatto
Date: Fri, 7 Jun 2019 11:38:35 +0200
Subject: [PATCH] Implement ddpg

---
 pom.xml | 6 +-
 .../cnnarch/gluongenerator/CNNArch2Gluon.java | 74 +-
 .../gluongenerator/CNNTrain2Gluon.java | 70 +-
 .../ReinforcementConfigurationData.java | 168 ++-
 .../annotations/ArchitectureAdapter.java | 88 ++
 .../CriticNetworkGenerationException.java | 7 +
 .../critic/CriticNetworkGenerationPair.java | 21 +
 .../critic/CriticNetworkGenerator.java | 211 +++
 .../resources/templates/gluon/CNNCreator.ftl | 4 +-
 src/main/resources/templates/gluon/CNNNet.ftl | 2 +-
 .../templates/gluon/elements/Input.ftl | 6 +-
 .../templates/gluon/reinforcement/Trainer.ftl | 210 ++-
 .../gluon/reinforcement/agent/Agent.ftl | 1144 +++++++++++------
 .../reinforcement/agent/ReplayMemory.ftl | 137 +-
 .../gluon/reinforcement/agent/Strategy.ftl | 172 +++
 .../architecture/CriticArchitecture.ftl | 7 +
 .../reinforcement/environment/Environment.ftl | 70 +-
 .../reinforcement/params/DdpgAgentParams.ftl | 19 +
 .../reinforcement/params/DqnAgentParams.ftl | 23 +
 .../params/ReplayMemoryParams.ftl | 14 +
 .../reinforcement/params/StrategyParams.ftl | 39 +
 .../gluon/reinforcement/util/Logger.ftl | 95 ++
 .../gluon/reinforcement/util/Util.ftl | 238 +++-
 .../gluongenerator/GenerationTest.java | 101 +-
 .../IntegrationPythonWrapperTest.java | 12 +-
 .../util/TrainedArchitectureMockFactory.java | 48 +
 .../resources/target_code/CNNNet_Alexnet.py | 8 +-
 .../CNNNet_CifarClassifierNetwork.py | 8 +-
 .../resources/target_code/CNNNet_VGG16.py | 8 +-
 .../CNNTrainer_reinforcementConfig1.py | 133 +-
 .../reinforcement_learning/action_policy.py | 73 --
 .../reinforcement_learning/agent.py | 1144 +++++++++++------
 .../reinforcement_learning/cnnarch_logger.py | 95 ++
 .../reinforcement_learning/environment.py | 20 +-
 .../reinforcement_learning/replay_memory.py | 137 +-
 .../reinforcement_learning/strategy.py | 172 +++
 .../reinforcement_learning/util.py | 238 +++-
 .../CNNTrainer_reinforcementConfig2.py | 149 ++-
 .../reinforcement_learning/action_policy.py | 73 --
 .../reinforcement_learning/agent.py | 1144 +++++++++++------
 .../reinforcement_learning/cnnarch_logger.py | 95 ++
 .../reinforcement_learning/environment.py | 7 -
 .../reinforcement_learning/replay_memory.py | 137 +-
 .../reinforcement_learning/strategy.py | 172 +++
 .../reinforcement_learning/util.py | 238 +++-
 .../CNNTrainer_reinforcementConfig3.py | 117 ++
 .../reinforcement_learning/__init__.py | 0
 .../reinforcement_learning/agent.py | 900 +++++++++++++
 .../reinforcement_learning/cnnarch_logger.py | 95 ++
 .../reinforcement_learning/environment.py | 134 ++
 .../reinforcement_learning/replay_memory.py | 208 +++
 .../reinforcement_learning/strategy.py | 172 +++
 .../reinforcement_learning/util.py | 276 ++++
 .../ReinforcementConfig3/start_training.sh | 2 +
 .../ddpg/CNNTrainer_actorNetwork.py | 136 ++
 .../CNNCreator_CriticNetwork.py | 56 +
 .../CNNNet_CriticNetwork.py | 119 ++
 .../ddpg/reinforcement_learning/__init__.py | 0
 .../ddpg/reinforcement_learning/agent.py | 900 +++++++++++++
 .../reinforcement_learning/cnnarch_logger.py | 95 ++
 .../reinforcement_learning/environment.py | 64 +
 .../reinforcement_learning/replay_memory.py | 208 +++
 .../ddpg/reinforcement_learning/strategy.py | 172 +++
 .../ddpg/reinforcement_learning/util.py | 276 ++++
 .../target_code/ddpg/start_training.sh | 2 +
 .../reinforcement_learning/action_policy.py | 73 --
 .../reinforcement_learning/strategy.py | 172 +++
 .../ros-ddpg/CNNTrainer_rosActorNetwork.py | 141 ++
.../CNNCreator_RosCriticNetwork.py | 56 + .../CNNNet_RosCriticNetwork.py | 119 ++ .../reinforcement_learning/__init__.py | 0 .../ros-ddpg/reinforcement_learning/agent.py | 900 +++++++++++++ .../reinforcement_learning/cnnarch_logger.py | 95 ++ .../reinforcement_learning/environment.py | 149 +++ .../reinforcement_learning/replay_memory.py | 208 +++ .../reinforcement_learning/strategy.py | 172 +++ .../ros-ddpg/reinforcement_learning/util.py | 276 ++++ .../target_code/ros-ddpg/start_training.sh | 2 + .../valid_tests/ReinforcementConfig1.cnnt | 2 +- .../valid_tests/ReinforcementConfig2.cnnt | 5 +- .../valid_tests/ReinforcementConfig3.cnnt | 44 + .../valid_tests/ddpg-ros/RosActorNetwork.cnnt | 66 + .../ddpg-ros/comp/RosCriticNetwork.cnna | 14 + .../ddpg-ros/reward/RewardFunction.emadl | 15 + .../valid_tests/ddpg/ActorNetwork.cnnt | 61 + .../valid_tests/ddpg/comp/CriticNetwork.cnna | 14 + 86 files changed, 11591 insertions(+), 1962 deletions(-) create mode 100644 src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/annotations/ArchitectureAdapter.java create mode 100644 src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationException.java create mode 100644 src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationPair.java create mode 100644 src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerator.java create mode 100644 src/main/resources/templates/gluon/reinforcement/agent/Strategy.ftl create mode 100644 src/main/resources/templates/gluon/reinforcement/architecture/CriticArchitecture.ftl create mode 100644 src/main/resources/templates/gluon/reinforcement/params/DdpgAgentParams.ftl create mode 100644 src/main/resources/templates/gluon/reinforcement/params/DqnAgentParams.ftl create mode 100644 src/main/resources/templates/gluon/reinforcement/params/ReplayMemoryParams.ftl create mode 100644 src/main/resources/templates/gluon/reinforcement/params/StrategyParams.ftl create mode 100644 src/main/resources/templates/gluon/reinforcement/util/Logger.ftl create mode 100644 src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/util/TrainedArchitectureMockFactory.java delete mode 100644 src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/action_policy.py create mode 100644 src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/cnnarch_logger.py create mode 100644 src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/strategy.py delete mode 100644 src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/action_policy.py create mode 100644 src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/cnnarch_logger.py create mode 100644 src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/strategy.py create mode 100644 src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py create mode 100644 src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/__init__.py create mode 100644 src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py create mode 100644 src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/cnnarch_logger.py create mode 100644 src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/environment.py create mode 100644 
src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/replay_memory.py create mode 100644 src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/strategy.py create mode 100644 src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/util.py create mode 100644 src/test/resources/target_code/ReinforcementConfig3/start_training.sh create mode 100644 src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py create mode 100644 src/test/resources/target_code/ddpg/reinforcement_learning/CNNCreator_CriticNetwork.py create mode 100644 src/test/resources/target_code/ddpg/reinforcement_learning/CNNNet_CriticNetwork.py create mode 100644 src/test/resources/target_code/ddpg/reinforcement_learning/__init__.py create mode 100644 src/test/resources/target_code/ddpg/reinforcement_learning/agent.py create mode 100644 src/test/resources/target_code/ddpg/reinforcement_learning/cnnarch_logger.py create mode 100644 src/test/resources/target_code/ddpg/reinforcement_learning/environment.py create mode 100644 src/test/resources/target_code/ddpg/reinforcement_learning/replay_memory.py create mode 100644 src/test/resources/target_code/ddpg/reinforcement_learning/strategy.py create mode 100644 src/test/resources/target_code/ddpg/reinforcement_learning/util.py create mode 100644 src/test/resources/target_code/ddpg/start_training.sh delete mode 100644 src/test/resources/target_code/reinforcement_learning/action_policy.py create mode 100644 src/test/resources/target_code/reinforcement_learning/strategy.py create mode 100644 src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py create mode 100644 src/test/resources/target_code/ros-ddpg/reinforcement_learning/CNNCreator_RosCriticNetwork.py create mode 100644 src/test/resources/target_code/ros-ddpg/reinforcement_learning/CNNNet_RosCriticNetwork.py create mode 100644 src/test/resources/target_code/ros-ddpg/reinforcement_learning/__init__.py create mode 100644 src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py create mode 100644 src/test/resources/target_code/ros-ddpg/reinforcement_learning/cnnarch_logger.py create mode 100644 src/test/resources/target_code/ros-ddpg/reinforcement_learning/environment.py create mode 100644 src/test/resources/target_code/ros-ddpg/reinforcement_learning/replay_memory.py create mode 100644 src/test/resources/target_code/ros-ddpg/reinforcement_learning/strategy.py create mode 100644 src/test/resources/target_code/ros-ddpg/reinforcement_learning/util.py create mode 100644 src/test/resources/target_code/ros-ddpg/start_training.sh create mode 100644 src/test/resources/valid_tests/ReinforcementConfig3.cnnt create mode 100644 src/test/resources/valid_tests/ddpg-ros/RosActorNetwork.cnnt create mode 100644 src/test/resources/valid_tests/ddpg-ros/comp/RosCriticNetwork.cnna create mode 100644 src/test/resources/valid_tests/ddpg-ros/reward/RewardFunction.emadl create mode 100644 src/test/resources/valid_tests/ddpg/ActorNetwork.cnnt create mode 100644 src/test/resources/valid_tests/ddpg/comp/CriticNetwork.cnna diff --git a/pom.xml b/pom.xml index 0f5f083..325d855 100644 --- a/pom.xml +++ b/pom.xml @@ -8,7 +8,7 @@ de.monticore.lang.monticar cnnarch-gluon-generator - 0.2.0-SNAPSHOT + 0.2.1-SNAPSHOT @@ -16,8 +16,8 @@ 0.3.0-SNAPSHOT - 0.3.0-SNAPSHOT - 0.2.14-SNAPSHOT + 0.3.2-SNAPSHOT + 0.2.15-SNAPSHOT 0.1.4 0.0.1 diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNArch2Gluon.java 
b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNArch2Gluon.java index 2a6106f..4244087 100644 --- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNArch2Gluon.java +++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNArch2Gluon.java @@ -20,11 +20,13 @@ */ package de.monticore.lang.monticar.cnnarch.gluongenerator; +import de.monticore.lang.monticar.cnnarch._symboltable.IOSymbol; import de.monticore.lang.monticar.cnnarch.mxnetgenerator.CNNArch2MxNet; import de.monticore.lang.monticar.cnnarch.mxnetgenerator.Target; import de.monticore.lang.monticar.cnnarch._symboltable.ArchitectureSymbol; import de.monticore.lang.monticar.cnnarch.mxnetgenerator.TemplateConfiguration; +import de.se_rwth.commons.logging.Log; import java.util.HashMap; import java.util.Map; @@ -34,35 +36,81 @@ public class CNNArch2Gluon extends CNNArch2MxNet { //check cocos with CNNArchCocos.checkAll(architecture) before calling this method. @Override public Map generateStrings(ArchitectureSymbol architecture){ - TemplateConfiguration templateConfiguration = new GluonTemplateConfiguration(); + Map fileContentMap = compileFileContentMap(architecture); + checkValidGeneration(architecture); + return fileContentMap; + } + + public Map generateStringsAllowMultipleIO(ArchitectureSymbol architecture, Boolean pythonFilesOnly) { + Map fileContentMap; + if (pythonFilesOnly) { + fileContentMap = compilePythonFilesOnlyContentMap(architecture); + } else { + fileContentMap = compileFileContentMap(architecture); + } + checkValidOutputTypes(architecture); + return fileContentMap; + } + private void checkValidOutputTypes(ArchitectureSymbol architecture) { + if (((IOSymbol)architecture.getOutputs().get(0)).getDefinition().getType().getWidth() != 1 + || ((IOSymbol)architecture.getOutputs().get(0)).getDefinition().getType().getHeight() != 1) { + Log.error("This cnn architecture has a multi-dimensional output, which is currently not supported by" + + " the code generator.", architecture.getSourcePosition()); + } + } + + private Map compilePythonFiles(CNNArch2GluonTemplateController controller, ArchitectureSymbol architecture) { Map fileContentMap = new HashMap<>(); - CNNArch2GluonTemplateController archTc = new CNNArch2GluonTemplateController( - architecture, templateConfiguration); Map.Entry temp; - temp = archTc.process("CNNPredictor", Target.CPP); - fileContentMap.put(temp.getKey(), temp.getValue()); - - temp = archTc.process("CNNNet", Target.PYTHON); + temp = controller.process("CNNNet", Target.PYTHON); fileContentMap.put(temp.getKey(), temp.getValue()); if (architecture.getDataPath() != null) { - temp = archTc.process("CNNDataLoader", Target.PYTHON); + temp = controller.process("CNNDataLoader", Target.PYTHON); fileContentMap.put(temp.getKey(), temp.getValue()); } - temp = archTc.process("CNNCreator", Target.PYTHON); + temp = controller.process("CNNCreator", Target.PYTHON); + fileContentMap.put(temp.getKey(), temp.getValue()); + + return fileContentMap; + } + + private Map compileCppFiles(CNNArch2GluonTemplateController controller) { + Map fileContentMap = new HashMap<>(); + Map.Entry temp; + + temp = controller.process("CNNPredictor", Target.CPP); fileContentMap.put(temp.getKey(), temp.getValue()); - temp = archTc.process("execute", Target.CPP); + temp = controller.process("execute", Target.CPP); fileContentMap.put(temp.getKey().replace(".h", ""), temp.getValue()); - temp = archTc.process("CNNBufferFile", Target.CPP); + temp = controller.process("CNNBufferFile", Target.CPP); 
fileContentMap.put("CNNBufferFile.h", temp.getValue()); - checkValidGeneration(architecture); + return fileContentMap; + } + + private Map compileFileContentMap(ArchitectureSymbol architecture) { + TemplateConfiguration templateConfiguration = new GluonTemplateConfiguration(); + + Map fileContentMap = new HashMap<>(); + CNNArch2GluonTemplateController archTc = new CNNArch2GluonTemplateController( + architecture, templateConfiguration); + + fileContentMap.putAll(compilePythonFiles(archTc, architecture)); + fileContentMap.putAll(compileCppFiles(archTc)); return fileContentMap; } -} + + private Map compilePythonFilesOnlyContentMap(ArchitectureSymbol architecture) { + TemplateConfiguration templateConfiguration = new GluonTemplateConfiguration(); + CNNArch2GluonTemplateController archTc = new CNNArch2GluonTemplateController( + architecture, templateConfiguration); + return compilePythonFiles(archTc, architecture); + } +} \ No newline at end of file diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNTrain2Gluon.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNTrain2Gluon.java index 980b517..6cdb621 100644 --- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNTrain2Gluon.java +++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNTrain2Gluon.java @@ -1,6 +1,8 @@ package de.monticore.lang.monticar.cnnarch.gluongenerator; import com.google.common.collect.Maps; +import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.critic.CriticNetworkGenerationPair; +import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.critic.CriticNetworkGenerator; import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.FunctionParameterChecker; import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.RewardFunctionParameterAdapter; import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.RewardFunctionSourceGenerator; @@ -10,7 +12,9 @@ import de.monticore.lang.monticar.cnnarch.mxnetgenerator.CNNTrain2MxNet; import de.monticore.lang.monticar.cnnarch.mxnetgenerator.TemplateConfiguration; import de.monticore.lang.monticar.cnntrain._symboltable.ConfigurationSymbol; import de.monticore.lang.monticar.cnntrain._symboltable.LearningMethod; +import de.monticore.lang.monticar.cnntrain._symboltable.RLAlgorithm; import de.monticore.lang.monticar.cnntrain._symboltable.RewardFunctionSymbol; +import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; import de.monticore.lang.monticar.generator.FileContent; import de.monticore.lang.monticar.generator.cpp.GeneratorCPP; import de.monticore.lang.monticar.generator.pythonwrapper.GeneratorPythonWrapperStandaloneApi; @@ -25,6 +29,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; +import java.util.stream.Collectors; public class CNNTrain2Gluon extends CNNTrain2MxNet { private static final String REINFORCEMENT_LEARNING_FRAMEWORK_MODULE = "reinforcement_learning"; @@ -62,6 +67,15 @@ public class CNNTrain2Gluon extends CNNTrain2MxNet { public void generate(Path modelsDirPath, String rootModelName) { ConfigurationSymbol configuration = this.getConfigurationSymbol(modelsDirPath, rootModelName); + if (configuration.getLearningMethod().equals(LearningMethod.REINFORCEMENT)) { + throw new IllegalStateException("Cannot call generate of reinforcement configuration without specifying " + + "the trained architecture"); + } + + 
generateFilesFromConfigurationSymbol(configuration); + } + + private void generateFilesFromConfigurationSymbol(ConfigurationSymbol configuration) { Map fileContents = this.generateStrings(configuration); GeneratorCPP genCPP = new GeneratorCPP(); genCPP.setGenerationTargetPath(this.getGenerationTargetPath()); @@ -78,6 +92,13 @@ public class CNNTrain2Gluon extends CNNTrain2MxNet { } } + public void generate(Path modelsDirPath, String rootModelName, TrainedArchitecture trainedArchitecture) { + ConfigurationSymbol configurationSymbol = this.getConfigurationSymbol(modelsDirPath, rootModelName); + configurationSymbol.setTrainedArchitecture(trainedArchitecture); + this.setRootProjectModelsDir(modelsDirPath.toString()); + generateFilesFromConfigurationSymbol(configurationSymbol); + } + @Override public Map generateStrings(ConfigurationSymbol configuration) { TemplateConfiguration templateConfiguration = new GluonTemplateConfiguration(); @@ -98,8 +119,30 @@ public class CNNTrain2Gluon extends CNNTrain2MxNet { fileContentMap.put("supervised_trainer.py", cnnSupervisedTrainerContent); } else if (configData.isReinforcementLearning()) { final String trainerName = "CNNTrainer_" + getInstanceName(); + final RLAlgorithm rlAlgorithm = configData.getRlAlgorithm(); + + if (rlAlgorithm.equals(RLAlgorithm.DDPG)) { + CriticNetworkGenerator criticNetworkGenerator = new CriticNetworkGenerator(); + criticNetworkGenerator.setGenerationTargetPath( + Paths.get(getGenerationTargetPath(), REINFORCEMENT_LEARNING_FRAMEWORK_MODULE).toString()); + if (getRootProjectModelsDir().isPresent()) { + criticNetworkGenerator.setRootModelsDir(getRootProjectModelsDir().get()); + } else { + Log.error("No root model dir set"); + } + + CriticNetworkGenerationPair criticNetworkResult + = criticNetworkGenerator.generateCriticNetworkContent(templateConfiguration, configuration); + + fileContentMap.putAll(criticNetworkResult.getFileContent().entrySet().stream().collect(Collectors.toMap( + k -> REINFORCEMENT_LEARNING_FRAMEWORK_MODULE + "/" + k.getKey(), + Map.Entry::getValue)) + ); + ftlContext.put("criticInstanceName", criticNetworkResult.getCriticNetworkName()); + } + ftlContext.put("trainerName", trainerName); - Map rlFrameworkContentMap = constructReinforcementLearningFramework(templateConfiguration, ftlContext); + Map rlFrameworkContentMap = constructReinforcementLearningFramework(templateConfiguration, ftlContext, rlAlgorithm); fileContentMap.putAll(rlFrameworkContentMap); final String reinforcementTrainerContent = templateConfiguration.processTemplate(ftlContext, "reinforcement/Trainer.ftl"); @@ -161,23 +204,34 @@ public class CNNTrain2Gluon extends CNNTrain2MxNet { } private Map constructReinforcementLearningFramework( - final TemplateConfiguration templateConfiguration, final Map ftlContext) { + final TemplateConfiguration templateConfiguration, + final Map ftlContext, + RLAlgorithm rlAlgorithm) { Map fileContentMap = Maps.newHashMap(); ftlContext.put("rlFrameworkModule", REINFORCEMENT_LEARNING_FRAMEWORK_MODULE); - final String reinforcementAgentContent = templateConfiguration.processTemplate(ftlContext, "reinforcement/agent/Agent.ftl"); + final String loggerContent = templateConfiguration.processTemplate(ftlContext, + "reinforcement/util/Logger.ftl"); + fileContentMap.put(REINFORCEMENT_LEARNING_FRAMEWORK_MODULE + "/cnnarch_logger.py", loggerContent); + + final String reinforcementAgentContent = templateConfiguration.processTemplate(ftlContext, + "reinforcement/agent/Agent.ftl"); 
fileContentMap.put(REINFORCEMENT_LEARNING_FRAMEWORK_MODULE + "/agent.py", reinforcementAgentContent); - final String reinforcementPolicyContent = templateConfiguration.processTemplate(ftlContext, "reinforcement/agent/ActionPolicy.ftl"); - fileContentMap.put(REINFORCEMENT_LEARNING_FRAMEWORK_MODULE + "/action_policy.py", reinforcementPolicyContent); + final String reinforcementStrategyContent = templateConfiguration.processTemplate( + ftlContext, "reinforcement/agent/Strategy.ftl"); + fileContentMap.put(REINFORCEMENT_LEARNING_FRAMEWORK_MODULE + "/strategy.py", reinforcementStrategyContent); - final String replayMemoryContent = templateConfiguration.processTemplate(ftlContext, "reinforcement/agent/ReplayMemory.ftl"); + final String replayMemoryContent = templateConfiguration.processTemplate( + ftlContext, "reinforcement/agent/ReplayMemory.ftl"); fileContentMap.put(REINFORCEMENT_LEARNING_FRAMEWORK_MODULE + "/replay_memory.py", replayMemoryContent); - final String environmentContent = templateConfiguration.processTemplate(ftlContext, "reinforcement/environment/Environment.ftl"); + final String environmentContent = templateConfiguration.processTemplate( + ftlContext, "reinforcement/environment/Environment.ftl"); fileContentMap.put(REINFORCEMENT_LEARNING_FRAMEWORK_MODULE + "/environment.py", environmentContent); - final String utilContent = templateConfiguration.processTemplate(ftlContext, "reinforcement/util/Util.ftl"); + final String utilContent = templateConfiguration.processTemplate( + ftlContext, "reinforcement/util/Util.ftl"); fileContentMap.put(REINFORCEMENT_LEARNING_FRAMEWORK_MODULE + "/util.py", utilContent); final String initContent = ""; diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/ReinforcementConfigurationData.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/ReinforcementConfigurationData.java index c5991e7..7c64ee0 100644 --- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/ReinforcementConfigurationData.java +++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/ReinforcementConfigurationData.java @@ -1,13 +1,13 @@ package de.monticore.lang.monticar.cnnarch.gluongenerator; +import de.monticore.lang.monticar.cnnarch._symboltable.ArchitectureSymbol; import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.RewardFunctionParameterAdapter; import de.monticore.lang.monticar.cnnarch.mxnetgenerator.ConfigurationData; import de.monticore.lang.monticar.cnntrain._symboltable.*; +import de.monticore.lang.monticar.cnntrain.annotations.Range; +import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; +import java.util.*; /** * @@ -25,10 +25,19 @@ public class ReinforcementConfigurationData extends ConfigurationData { private static final String AST_ENTRY_AGENT_NAME = "agent_name"; private static final String AST_ENTRY_USE_DOUBLE_DQN = "use_double_dqn"; private static final String AST_ENTRY_LOSS = "loss"; - + private static final String AST_ENTRY_RL_ALGORITHM = "rl_algorithm"; private static final String AST_ENTRY_REPLAY_MEMORY = "replay_memory"; - private static final String AST_ENTRY_ACTION_SELECTION = "action_selection"; + private static final String AST_ENTRY_STRATEGY = "strategy"; private static final String AST_ENTRY_ENVIRONMENT = "environment"; + private static final String AST_ENTRY_START_TRAINING_AT = "start_training_at"; + private static final String 
AST_SOFT_TARGET_UPDATE_RATE = "soft_target_update_rate"; + private static final String AST_EVALUATION_SAMPLES = "evaluation_samples"; + + private static final String ENVIRONMENT_PARAM_REWARD_TOPIC = "reward_topic"; + private static final String ENVIRONMENT_ROS = "ros_interface"; + private static final String ENVIRONMENT_GYM = "gym"; + + private static final String STRATEGY_ORNSTEIN_UHLENBECK = "ornstein_uhlenbeck"; public ReinforcementConfigurationData(ConfigurationSymbol configuration, String instanceName) { super(configuration, instanceName); @@ -97,6 +106,67 @@ public class ReinforcementConfigurationData extends ConfigurationData { ? null : (Boolean)retrieveConfigurationEntryValueByKey(AST_ENTRY_USE_DOUBLE_DQN); } + public Double getSoftTargetUpdateRate() { + return !configurationContainsKey(AST_SOFT_TARGET_UPDATE_RATE) + ? null : (Double)retrieveConfigurationEntryValueByKey(AST_SOFT_TARGET_UPDATE_RATE); + } + + public Integer getStartTrainingAt() { + return !configurationContainsKey(AST_ENTRY_START_TRAINING_AT) + ? null : (Integer)retrieveConfigurationEntryValueByKey(AST_ENTRY_START_TRAINING_AT); + } + + public Integer getEvaluationSamples() { + return !configurationContainsKey(AST_EVALUATION_SAMPLES) + ? null : (Integer)retrieveConfigurationEntryValueByKey(AST_EVALUATION_SAMPLES); + } + + + + public RLAlgorithm getRlAlgorithm() { + if (!isReinforcementLearning()) { + return null; + } + return !configurationContainsKey(AST_ENTRY_RL_ALGORITHM) + ? RLAlgorithm.DQN : (RLAlgorithm)retrieveConfigurationEntryValueByKey(AST_ENTRY_RL_ALGORITHM); + } + + public String getInputNameOfTrainedArchitecture() { + if (!this.getConfiguration().getTrainedArchitecture().isPresent()) { + throw new IllegalStateException("No trained architecture set"); + } + TrainedArchitecture trainedArchitecture = getConfiguration().getTrainedArchitecture().get(); + // We allow only one input, the first one is the only input + return trainedArchitecture.getInputs().get(0); + } + + public String getOutputNameOfTrainedArchitecture() { + if (!this.getConfiguration().getTrainedArchitecture().isPresent()) { + throw new IllegalStateException("No trained architecture set"); + } + TrainedArchitecture trainedArchitecture = getConfiguration().getTrainedArchitecture().get(); + // We allow only one output, the first one is the only output + return trainedArchitecture.getOutputs().get(0); + } + + public List getStateDim() { + if (!this.getConfiguration().getTrainedArchitecture().isPresent()) { + return null; + } + final String inputName = getInputNameOfTrainedArchitecture(); + TrainedArchitecture trainedArchitecture = this.getConfiguration().getTrainedArchitecture().get(); + return trainedArchitecture.getDimensions().get(inputName); + } + + public List getActionDim() { + if (!this.getConfiguration().getTrainedArchitecture().isPresent()) { + return null; + } + final String outputName = getOutputNameOfTrainedArchitecture(); + TrainedArchitecture trainedArchitecture = this.getConfiguration().getTrainedArchitecture().get(); + return trainedArchitecture.getDimensions().get(outputName); + } + public String getLoss() { return !configurationContainsKey(AST_ENTRY_LOSS) ? 
null : retrieveConfigurationEntryValueByKey(AST_ENTRY_LOSS).toString(); @@ -106,8 +176,36 @@ public class ReinforcementConfigurationData extends ConfigurationData { return getMultiParamEntry(AST_ENTRY_REPLAY_MEMORY, "method"); } - public Map getActionSelection() { - return getMultiParamEntry(AST_ENTRY_ACTION_SELECTION, "method"); + public Map getStrategy() { + assert isReinforcementLearning(): "Strategy parameter only for reinforcement learning but called in a " + + " non reinforcement learning context"; + Map strategyParams = getMultiParamEntry(AST_ENTRY_STRATEGY, "method"); + if (strategyParams.get("method").equals(STRATEGY_ORNSTEIN_UHLENBECK)) { + assert getConfiguration().getTrainedArchitecture().isPresent(): "Architecture not present," + + " but reinforcement training"; + TrainedArchitecture trainedArchitecture = getConfiguration().getTrainedArchitecture().get(); + final String actionPortName = getOutputNameOfTrainedArchitecture(); + Range actionRange = trainedArchitecture.getRanges().get(actionPortName); + + if (actionRange.isLowerLimitInfinity() && actionRange.isUpperLimitInfinity()) { + strategyParams.put("action_low", null); + strategyParams.put("action_high", null); + } else if(!actionRange.isLowerLimitInfinity() && actionRange.isUpperLimitInfinity()) { + assert actionRange.getLowerLimit().isPresent(); + strategyParams.put("action_low", actionRange.getLowerLimit().get()); + strategyParams.put("action_high", null); + } else if (actionRange.isLowerLimitInfinity() && !actionRange.isUpperLimitInfinity()) { + assert actionRange.getUpperLimit().isPresent(); + strategyParams.put("action_low", null); + strategyParams.put("action_high", actionRange.getUpperLimit().get()); + } else { + assert actionRange.getLowerLimit().isPresent(); + assert actionRange.getUpperLimit().isPresent(); + strategyParams.put("action_low", actionRange.getLowerLimit().get()); + strategyParams.put("action_high", actionRange.getUpperLimit().get()); + } + } + return strategyParams; } public Map getEnvironment() { @@ -136,6 +234,16 @@ public class ReinforcementConfigurationData extends ConfigurationData { .getRewardFunctionParameter().orElse(null)); } + public boolean isDiscreteRlAlgorithm() { + assert isReinforcementLearning(); + return getRlAlgorithm().equals(RLAlgorithm.DQN); + } + + public boolean isContinuousRlAlgorithm() { + assert isReinforcementLearning(); + return getRlAlgorithm().equals(RLAlgorithm.DDPG); + } + public Map getRewardFunctionStateParameter() { if (!getRlRewardFunctionParameter().isPresent() || !getRlRewardFunctionParameter().get().getInputStateParameterName().isPresent()) { @@ -159,6 +267,50 @@ public class ReinforcementConfigurationData extends ConfigurationData { return getRlRewardFunctionParameter().get().getOutputParameterName().orElse(null); } + public String getCriticOptimizerName() { + if (!getConfiguration().getCriticOptimizer().isPresent()) { + return null; + } + return getConfiguration().getCriticOptimizer().get().getName(); + } + + public Map getCriticOptimizerParams() { + // get classes for single enum values + assert getConfiguration().getCriticOptimizer().isPresent(): + "Critic optimizer params called although, not present"; + List lrPolicyClasses = new ArrayList<>(); + for (LRPolicy enum_value: LRPolicy.values()) { + lrPolicyClasses.add(enum_value.getClass()); + } + + + + Map mapToStrings = new HashMap<>(); + Map optimizerParams = + getConfiguration().getCriticOptimizer().get().getOptimizerParamMap(); + for (Map.Entry entry : optimizerParams.entrySet()) { + String paramName = 
entry.getKey(); + String valueAsString = entry.getValue().toString(); + Class realClass = entry.getValue().getValue().getValue().getClass(); + if (realClass == Boolean.class) { + valueAsString = (Boolean) entry.getValue().getValue().getValue() ? "True" : "False"; + } else if (lrPolicyClasses.contains(realClass)) { + valueAsString = "'" + valueAsString + "'"; + } + mapToStrings.put(paramName, valueAsString); + } + return mapToStrings; + } + + public boolean hasRosRewardTopic() { + Map environmentParameters = getMultiParamEntry(AST_ENTRY_ENVIRONMENT, "environment"); + if (environmentParameters == null + || !environmentParameters.containsKey("environment")) { + return false; + } + return environmentParameters.containsKey(ENVIRONMENT_PARAM_REWARD_TOPIC); + } + private Map getMultiParamEntry(final String key, final String valueName) { if (!configurationContainsKey(key)) { return null; diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/annotations/ArchitectureAdapter.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/annotations/ArchitectureAdapter.java new file mode 100644 index 0000000..8f2d088 --- /dev/null +++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/annotations/ArchitectureAdapter.java @@ -0,0 +1,88 @@ +package de.monticore.lang.monticar.cnnarch.gluongenerator.annotations; + +import de.monticore.lang.monticar.cnnarch._symboltable.ArchitectureSymbol; +import de.monticore.lang.monticar.cnnarch._symboltable.IOSymbol; +import de.monticore.lang.monticar.cnntrain.annotations.Range; +import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; +import de.monticore.lang.monticar.ranges._ast.ASTRange; +import de.monticore.symboltable.CommonSymbol; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkNotNull; + +public class ArchitectureAdapter implements TrainedArchitecture { + private ArchitectureSymbol architectureSymbol; + + + public ArchitectureAdapter(final ArchitectureSymbol architectureSymbol) { + checkNotNull(architectureSymbol); + this.architectureSymbol = architectureSymbol; + } + + @Override + public List getInputs() { + return getIOInputSymbols().stream() + .map(CommonSymbol::getName) + .collect(Collectors.toList()); + } + + @Override + public List getOutputs() { + return getIOOutputSymbols().stream() + .map(CommonSymbol::getName) + .collect(Collectors.toList()); + } + + @Override + public Map> getDimensions() { + return getAllIOSymbols().stream().collect(Collectors.toMap(CommonSymbol::getName, + s-> s.getDefinition().getType().getDimensions())); + } + + @Override + public Map getRanges() { + return getAllIOSymbols().stream().collect(Collectors.toMap(CommonSymbol::getName, + s -> astRangeToTrainRange(s.getDefinition().getType().getDomain().getRangeOpt().orElse(null)))); + } + + @Override + public Map getTypes() { + return getAllIOSymbols().stream().collect(Collectors.toMap(CommonSymbol::getName, + s -> s.getDefinition().getType().getDomain().getName())); + } + + private Range astRangeToTrainRange(final ASTRange range) { + if (range == null || (range.hasNoLowerLimit() && range.hasNoUpperLimit())) { + return Range.withInfinityLimits(); + } else if (range.hasNoUpperLimit() && !range.hasNoLowerLimit()) { + double lowerLimit = range.getStartValue().doubleValue(); + return Range.withUpperInfinityLimit(lowerLimit); + } else if (!range.hasNoUpperLimit() && range.hasNoLowerLimit()) { + double 
upperLimit = range.getEndValue().doubleValue();
+            return Range.withLowerInfinityLimit(upperLimit);
+        } else {
+            double lowerLimit = range.getStartValue().doubleValue();
+            double upperLimit = range.getEndValue().doubleValue();
+            return Range.withLimits(lowerLimit, upperLimit);
+        }
+    }
+
+    private List<IOSymbol> getIOOutputSymbols() {
+        return architectureSymbol.getOutputs();
+    }
+
+    private List<IOSymbol> getIOInputSymbols() {
+        return architectureSymbol.getInputs();
+    }
+
+    private List<IOSymbol> getAllIOSymbols() {
+        List<IOSymbol> ioSymbols = new ArrayList<>();
+        ioSymbols.addAll(getIOOutputSymbols());
+        ioSymbols.addAll(getIOInputSymbols());
+        return ioSymbols;
+    }
+}
diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationException.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationException.java
new file mode 100644
index 0000000..70389b6
--- /dev/null
+++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationException.java
@@ -0,0 +1,7 @@
+package de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.critic;
+
+public class CriticNetworkGenerationException extends RuntimeException {
+    public CriticNetworkGenerationException(String s) {
+        super("Generation of critic network failed: " + s);
+    }
+}
diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationPair.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationPair.java
new file mode 100644
index 0000000..8ea2d62
--- /dev/null
+++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationPair.java
@@ -0,0 +1,21 @@
+package de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.critic;
+
+import java.util.Map;
+
+public class CriticNetworkGenerationPair {
+    private String criticNetworkName;
+    private Map<String, String> fileContent;
+
+    public CriticNetworkGenerationPair(String criticNetworkName, Map<String, String> fileContent) {
+        this.criticNetworkName = criticNetworkName;
+        this.fileContent = fileContent;
+    }
+
+    public String getCriticNetworkName() {
+        return criticNetworkName;
+    }
+
+    public Map<String, String> getFileContent() {
+        return fileContent;
+    }
+}
diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerator.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerator.java
new file mode 100644
index 0000000..b03f7f1
--- /dev/null
+++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerator.java
@@ -0,0 +1,211 @@
+package de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.critic;
+
+import com.google.common.collect.Lists;
+import de.monticore.lang.monticar.cnnarch._symboltable.*;
+import de.monticore.lang.monticar.cnnarch.gluongenerator.CNNArch2Gluon;
+import de.monticore.lang.monticar.cnnarch.mxnetgenerator.CNNArchSymbolCompiler;
+import de.monticore.lang.monticar.cnnarch.mxnetgenerator.TemplateConfiguration;
+import de.monticore.lang.monticar.cnnarch.mxnetgenerator.checker.AllowAllLayerSupportChecker;
+import de.monticore.lang.monticar.cnntrain._symboltable.ConfigurationSymbol;
+import de.monticore.lang.monticar.cnntrain.annotations.Range;
+import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture;
+import de.se_rwth.commons.logging.Log;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.*;
+
+import static com.google.common.base.Preconditions.checkState;
+
+public class CriticNetworkGenerator {
+    private static final String START_SEQUENCE = "implementationCritic(state,action)";
+
+    private String generationTargetPath;
+    private String rootModelsDir;
+
+    protected String getGenerationTargetPath() {
+        return generationTargetPath;
+    }
+
+    public void setGenerationTargetPath(String generationTargetPath) {
+        this.generationTargetPath = generationTargetPath;
+    }
+
+    protected String getRootModelsDir() {
+        return rootModelsDir;
+    }
+
+    public void setRootModelsDir(String rootModelsDir) {
+        this.rootModelsDir = rootModelsDir;
+    }
+
+    public CriticNetworkGenerationPair generateCriticNetworkContent(TemplateConfiguration templateConfiguration,
+                                                                    ConfigurationSymbol configurationSymbol) {
+        checkState(getRootModelsDir() != null, "Root project directory is not set");
+        checkState(getGenerationTargetPath() != null, "Target path is not set");
+
+        failIfArchitectureNotAvailable(configurationSymbol);
+        assert configurationSymbol.getTrainedArchitecture().isPresent();
+        TrainedArchitecture trainedArchitecture = configurationSymbol.getTrainedArchitecture().get();
+        failIfActorHasMultipleIO(trainedArchitecture);
+
+        List<String> criticNetwork = retrieveFullNameOfCriticsNetworkFromConfiguration(configurationSymbol);
+        final String criticNetworkName = criticNetwork.get(criticNetwork.size()-1);
+        final Path pathToCriticNetworkFile = retrievePathToCriticNetworkFileFromFullName(criticNetwork, criticNetworkName);
+        final String networkImplementation = parseNetworkImplementationFromFile(pathToCriticNetworkFile);
+
+        Map<String, Object> context = makeTemplateContextMap(trainedArchitecture, criticNetworkName, networkImplementation);
+        Path directoryOfCnnArchFile = makeCnnArchFileFromContext(templateConfiguration, context);
+
+        Map<String, String> fileContentMap = generatePythonNetworkFiles(criticNetworkName, directoryOfCnnArchFile);
+        deleteOutputDirectory(directoryOfCnnArchFile);
+
+        return new CriticNetworkGenerationPair(criticNetworkName, fileContentMap);
+    }
+
+    private void deleteOutputDirectory(Path directoryOfCnnArchFile) {
+        try {
+            FileUtils.deleteDirectory(directoryOfCnnArchFile.toFile());
+        } catch (IOException e) {
+            Log.warn("Cannot delete temporary CNN arch directory: " + directoryOfCnnArchFile.toString());
+        }
+    }
+
+    private Map<String, String> generatePythonNetworkFiles(String criticNetworkName, Path directoryOfCnnArchFile) {
+        CNNArch2Gluon gluonGenerator = new CNNArch2Gluon();
+        gluonGenerator.setGenerationTargetPath(this.getGenerationTargetPath());
+
+        Map<String, String> fileContentMap = new HashMap<>();
+        CNNArchSymbolCompiler symbolCompiler = new CNNArchSymbolCompiler(new AllowAllLayerSupportChecker());
+        ArchitectureSymbol architectureSymbol = symbolCompiler.compileArchitectureSymbolFromModelsDir(directoryOfCnnArchFile, criticNetworkName);
+        architectureSymbol.setComponentName(criticNetworkName);
+        fileContentMap.putAll(gluonGenerator.generateStringsAllowMultipleIO(architectureSymbol, true));
+        return fileContentMap;
+    }
+
+    private Path makeCnnArchFileFromContext(TemplateConfiguration templateConfiguration, Map<String, Object> context) {
+        final String architectureContent = templateConfiguration.processTemplate(
+            context, "reinforcement/architecture/CriticArchitecture.ftl");
+
+        Path tmpDirectoryToArchFile = Paths.get(this.getGenerationTargetPath(), "tmp");
+        try {
+            if (!tmpDirectoryToArchFile.toFile().exists()) {
+                Files.createDirectories(tmpDirectoryToArchFile);
+            }
+            Files.write(
+                Paths.get(tmpDirectoryToArchFile.toString(), context.get("architectureName") + ".cnna"), architectureContent.getBytes());
+        } catch (IOException e) {
+            failWithMessage(e.getMessage());
+        }
+        return tmpDirectoryToArchFile;
+    }
+
+    private Map<String, Object> makeTemplateContextMap(TrainedArchitecture trainedArchitecture, String criticNetworkName, String networkImplementation) {
+        final String stateName = trainedArchitecture.getInputs().get(0);
+        final String actionName = trainedArchitecture.getOutputs().get(0);
+
+        Map<String, List<Integer>> dimensions = trainedArchitecture.getDimensions();
+        List<Integer> stateDimensions = dimensions.get(stateName);
+        List<Integer> actionDimensions = dimensions.get(actionName);
+
+        Map<String, Range> ranges = trainedArchitecture.getRanges();
+        Range stateRange = ranges.get(stateName);
+        Range actionRange = ranges.get(actionName);
+
+        Map<String, String> types = trainedArchitecture.getTypes();
+        final String stateType = types.get(stateName);
+        final String actionType = types.get(actionName);
+
+        Map<String, Object> context = new HashMap<>();
+        context.put("stateDimension", stateDimensions);
+        context.put("actionDimension", actionDimensions);
+        context.put("stateRange", stateRange);
+        context.put("actionRange", actionRange);
+        context.put("stateType", stateType);
+        context.put("actionType", actionType);
+        context.put("implementation", networkImplementation);
+        context.put("architectureName", criticNetworkName);
+        return context;
+    }
+
+    private String parseNetworkImplementationFromFile(Path criticNetworkFile) {
+        String criticNetworkFileContent = null;
+        try {
+            criticNetworkFileContent = new String(Files.readAllBytes(criticNetworkFile), Charset.forName("UTF-8"));
+        } catch (IOException e) {
+            failWithMessage("Cannot read critic network file: " + e.getMessage());
+        }
+
+        String contentWhiteSpaceRemoved = criticNetworkFileContent.replaceAll("\\s+","");
+
+        if (!contentWhiteSpaceRemoved.contains(START_SEQUENCE)
+            || StringUtils.countMatches(contentWhiteSpaceRemoved, "{") != 1
+            || StringUtils.countMatches(contentWhiteSpaceRemoved, "}") != 1
+            || contentWhiteSpaceRemoved.charAt(contentWhiteSpaceRemoved.length() - 1) != '}') {
+            failWithMessage("Cannot parse critic file");
+        }
+
+        final int startOfNNImplementation = contentWhiteSpaceRemoved.indexOf("{") + 1;
+        final int endOfNNImplementation = contentWhiteSpaceRemoved.indexOf("}");
+        return contentWhiteSpaceRemoved.substring(startOfNNImplementation, endOfNNImplementation);
+    }
+
+    private Path retrievePathToCriticNetworkFileFromFullName(List<String> criticNetwork, String criticNetworkName) {
+        // Add file ending cnna to file name
+        criticNetwork.set(criticNetwork.size()-1, criticNetworkName + ".cnna");
+
+        Path root = Paths.get(this.getRootModelsDir());
+        Path criticNetworkFile = criticNetwork.stream().map(Paths::get).reduce(root, Path::resolve);
+
+        if (!criticNetworkFile.toFile().exists()) {
+            failWithMessage("Critic network file does not exist in " + criticNetworkFile.toString());
+        }
+        return criticNetworkFile;
+    }
+
+    private List<String> retrieveFullNameOfCriticsNetworkFromConfiguration(ConfigurationSymbol configurationSymbol) {
+        // Load critics network
+        failIfConfigurationHasNoCritic(configurationSymbol);
+
+        assert configurationSymbol.getEntry("critic").getValue().getValue() instanceof String;
+        List<String> criticNetwork = Lists.newArrayList((
+
(String)configurationSymbol.getEntry("critic").getValue().getValue()).split("\\.")); + + // Check if file name is upper case otherwise make it upper case + int size = criticNetwork.size(); + if (Character.isLowerCase(criticNetwork.get(size-1).charAt(0))) { + String lowerCaseFileName = criticNetwork.get(size-1); + String upperCaseFileName = lowerCaseFileName.substring(0,1).toUpperCase() + lowerCaseFileName.substring(1); + criticNetwork.set(size-1, upperCaseFileName); + } + return criticNetwork; + } + + private void failIfConfigurationHasNoCritic(ConfigurationSymbol configurationSymbol) { + if (!configurationSymbol.getEntryMap().containsKey("critic")) { + failWithMessage("No critic network file given, but is required for selected algorithm"); + } + } + + private void failIfActorHasMultipleIO(TrainedArchitecture trainedArchitecture) { + if (trainedArchitecture.getInputs().size() > 1 || trainedArchitecture.getOutputs().size() > 1) { + failWithMessage("Actor component with multiple inputs or outputs is not supported by this generator"); + } + } + + private void failIfArchitectureNotAvailable(ConfigurationSymbol configurationSymbol) { + if (!configurationSymbol.getTrainedArchitecture().isPresent()) { + failWithMessage("No architecture symbol found but is required for selected algorithm"); + } + } + + private void failWithMessage(final String message) { + Log.error("Critic network generation failed: " + message); + throw new CriticNetworkGenerationException(message); + } +} \ No newline at end of file diff --git a/src/main/resources/templates/gluon/CNNCreator.ftl b/src/main/resources/templates/gluon/CNNCreator.ftl index 1a9c62f..583c397 100644 --- a/src/main/resources/templates/gluon/CNNCreator.ftl +++ b/src/main/resources/templates/gluon/CNNCreator.ftl @@ -6,7 +6,7 @@ from CNNNet_${tc.fullArchitectureName} import Net class ${tc.fileNameWithoutEnding}: _model_dir_ = "model/${tc.componentName}/" _model_prefix_ = "model" - _input_shapes_ = [<#list tc.architecture.inputs as input>(${tc.join(input.definition.type.dimensions, ",")},)] + _input_shapes_ = [<#list tc.architecture.inputs as input>(${tc.join(input.definition.type.dimensions, ",")},)<#if input?has_next>,] def __init__(self): self.weight_initializer = mx.init.Normal() @@ -48,7 +48,7 @@ class ${tc.fileNameWithoutEnding}: self.net = Net(data_mean=data_mean, data_std=data_std) self.net.collect_params().initialize(self.weight_initializer, ctx=context) self.net.hybridize() - self.net(mx.nd.zeros((1,)+self._input_shapes_[0], ctx=context)) + self.net(<#list tc.architecture.inputs as input>mx.nd.zeros((1,)+self._input_shapes_[${input?index}], ctx=context)<#if input?has_next>,) if not os.path.exists(self._model_dir_): os.makedirs(self._model_dir_) diff --git a/src/main/resources/templates/gluon/CNNNet.ftl b/src/main/resources/templates/gluon/CNNNet.ftl index c02d173..121fdaa 100644 --- a/src/main/resources/templates/gluon/CNNNet.ftl +++ b/src/main/resources/templates/gluon/CNNNet.ftl @@ -74,5 +74,5 @@ class Net(gluon.HybridBlock): with self.name_scope(): ${tc.include(tc.architecture.body, "ARCHITECTURE_DEFINITION")} - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, <#list tc.architecture.inputs as input>${input}<#if input?has_next>, ): ${tc.include(tc.architecture.body, "FORWARD_FUNCTION")} \ No newline at end of file diff --git a/src/main/resources/templates/gluon/elements/Input.ftl b/src/main/resources/templates/gluon/elements/Input.ftl index d2a8688..11b7b24 100644 --- a/src/main/resources/templates/gluon/elements/Input.ftl +++ 
b/src/main/resources/templates/gluon/elements/Input.ftl @@ -2,11 +2,11 @@ <#if mode == "ARCHITECTURE_DEFINITION"> if not data_mean is None: assert(not data_std is None) - self.input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) + self.${element.name}_input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) else: - self.input_normalization = NoNormalization() + self.${element.name}_input_normalization = NoNormalization() <#if mode == "FORWARD_FUNCTION"> - ${element.name} = self.input_normalization(x) + ${element.name} = self.${element.name}_input_normalization(${element.name}) \ No newline at end of file diff --git a/src/main/resources/templates/gluon/reinforcement/Trainer.ftl b/src/main/resources/templates/gluon/reinforcement/Trainer.ftl index 6b28a79..e406907 100644 --- a/src/main/resources/templates/gluon/reinforcement/Trainer.ftl +++ b/src/main/resources/templates/gluon/reinforcement/Trainer.ftl @@ -1,35 +1,33 @@ <#setting number_format="computer"> <#assign config = configurations[0]> -from ${rlFrameworkModule}.agent import DqnAgent +<#assign rlAgentType=config.rlAlgorithm?switch("dqn", "DqnAgent", "ddpg", "DdpgAgent")> +from ${rlFrameworkModule}.agent import ${rlAgentType} from ${rlFrameworkModule}.util import AgentSignalHandler +from ${rlFrameworkModule}.cnnarch_logger import ArchLogger +<#if config.rlAlgorithm=="ddpg"> +from ${rlFrameworkModule}.CNNCreator_${criticInstanceName} import CNNCreator_${criticInstanceName} + import ${rlFrameworkModule}.environment import CNNCreator_${config.instanceName} import os import sys import re -import logging +import time +import numpy as np import mxnet as mx -session_output_dir = 'session' -<#if (config.agentName)??> -agent_name='${config.agentName}' -<#else> -agent_name='${config.instanceName}' - -session_param_output = os.path.join(session_output_dir, agent_name) -def resume_session(): - session_param_output = os.path.join(session_output_dir, agent_name) +def resume_session(sessions_dir): resume_session = False resume_directory = None - if os.path.isdir(session_output_dir) and os.path.isdir(session_param_output): + if os.path.isdir(sessions_dir): regex = re.compile(r'\d\d\d\d-\d\d-\d\d-\d\d-\d\d') - dir_content = os.listdir(session_param_output) + dir_content = os.listdir(sessions_dir) session_files = filter(regex.search, dir_content) session_files.sort(reverse=True) for d in session_files: - interrupted_session_dir = os.path.join(session_param_output, d, '.interrupted_session') + interrupted_session_dir = os.path.join(sessions_dir, d, '.interrupted_session') if os.path.isdir(interrupted_session_dir): resume = raw_input('Interrupted session from {} found. Do you want to resume? 
(y/n) '.format(d)) if resume == 'y': @@ -38,140 +36,130 @@ def resume_session(): break return resume_session, resume_directory + if __name__ == "__main__": +<#if (config.agentName)??> + agent_name = '${config.agentName}' +<#else> + agent_name = '${config.instanceName}' + + # Prepare output directory and logger + all_output_dir = os.path.join('model', agent_name) + output_directory = os.path.join( + all_output_dir, + time.strftime('%Y-%m-%d-%H-%M-%S', + time.localtime(time.time()))) + ArchLogger.set_output_directory(output_directory) + ArchLogger.set_logger_name(agent_name) + ArchLogger.set_output_level(ArchLogger.INFO) + <#if config.environment.environment == "gym"> env = ${rlFrameworkModule}.environment.GymEnvironment(<#if config.environment.name??>'${config.environment.name}'<#else>'CartPole-v0') <#else> env_params = { - 'ros_node_name' : '${config.instanceName}TrainerNode', + 'ros_node_name': '${config.instanceName}TrainerNode', <#if config.environment.state_topic??> - 'state_topic' : '${config.environment.state_topic}', + 'state_topic': '${config.environment.state_topic}', <#if config.environment.action_topic??> - 'action_topic' : '${config.environment.action_topic}', + 'action_topic': '${config.environment.action_topic}', <#if config.environment.reset_topic??> - 'reset_topic' : '${config.environment.reset_topic}', - -<#if config.environment.meta_topic??> - 'meta_topic' : '${config.environment.meta_topic}', - -<#if config.environment.greeting_topic??> - 'greeting_topic' : '${config.environment.greeting_topic}' + 'reset_topic': '${config.environment.reset_topic}', <#if config.environment.terminal_state_topic??> - 'terminal_state_topic' : '${config.environment.terminal_state_topic}' + 'terminal_state_topic': '${config.environment.terminal_state_topic}', + +<#if config.environment.reward_topic??> + 'reward_topic': '${config.environment.reward_topic}', } env = ${rlFrameworkModule}.environment.RosEnvironment(**env_params) + <#if (config.context)??> context = mx.${config.context}() <#else> context = mx.cpu() - net_creator = CNNCreator_${config.instanceName}.CNNCreator_${config.instanceName}() - net_creator.construct(context) - - replay_memory_params = { -<#if (config.replayMemory)??> - 'method':'${config.replayMemory.method}', -<#if (config.replayMemory.memory_size)??> - 'memory_size':${config.replayMemory.memory_size}, - -<#if (config.replayMemory.sample_size)??> - 'sample_size':${config.replayMemory.sample_size}, - +<#if config.rlAlgorithm == "dqn"> + qnet_creator = CNNCreator_${config.instanceName}.CNNCreator_${config.instanceName}() + qnet_creator.construct(context) <#else> - 'method':'online', + actor_creator = CNNCreator_${config.instanceName}.CNNCreator_${config.instanceName}() + actor_creator.construct(context) + critic_creator = CNNCreator_${criticInstanceName}() + critic_creator.construct(context) - 'state_dtype':'float32', - 'action_dtype':'uint8', - 'rewards_dtype':'float32' - } - policy_params = { -<#if (config.actionSelection)??> - 'method':'${config.actionSelection.method}', -<#else> - 'method':'epsgreedy' - -<#if (config.actionSelection.epsilon)??> - 'epsilon': ${config.actionSelection.epsilon}, - -<#if (config.actionSelection.min_epsilon)??> - 'min_epsilon': ${config.actionSelection.min_epsilon}, - -<#if (config.actionSelection.epsilon_decay_method)??> - 'epsilon_decay_method': '${config.actionSelection.epsilon_decay_method}', - -<#if (config.actionSelection.epsilon_decay)??> - 'epsilon_decay': ${config.actionSelection.epsilon_decay}, - - } - - resume_session, 
resume_directory = resume_session() - - if resume_session: - agent = DqnAgent.resume_from_session(resume_directory, net_creator.net, env) - else: - agent = DqnAgent( - network = net_creator.net, - environment=env, - replay_memory_params=replay_memory_params, - policy_params=policy_params, - state_dim=net_creator.get_input_shapes()[0], + agent_params = { + 'environment': env, + 'replay_memory_params': { +<#include "params/ReplayMemoryParams.ftl"> + }, + 'strategy_params': { +<#include "params/StrategyParams.ftl"> + }, + 'agent_name': agent_name, + 'verbose': True, + 'output_directory': output_directory, + 'state_dim': (<#list config.stateDim as d>${d},), + 'action_dim': (<#list config.actionDim as d>${d},), <#if (config.context)??> - ctx='${config.context}', + 'ctx': '${config.context}', <#if (config.discountFactor)??> - discount_factor=${config.discountFactor}, - -<#if (config.loss)??> - loss_function='${config.loss}', - -<#if (config.configuration.optimizer)??> - optimizer='${config.optimizerName}', - optimizer_params={ -<#list config.optimizerParams?keys as param> - '${param}': ${config.optimizerParams[param]}<#sep>, - - }, + 'discount_factor': ${config.discountFactor}, <#if (config.numEpisodes)??> - training_episodes=${config.numEpisodes}, + 'training_episodes': ${config.numEpisodes}, <#if (config.trainingInterval)??> - train_interval=${config.trainingInterval}, + 'train_interval': ${config.trainingInterval}, -<#if (config.useFixTargetNetwork)?? && config.useFixTargetNetwork> - use_fix_target=True, - target_update_interval=${config.targetNetworkUpdateInterval}, -<#else> - use_fix_target=False, - -<#if (config.useDoubleDqn)?? && config.useDoubleDqn> - double_dqn = True, -<#else> - double_dqn = False, +<#if (config.startTrainingAt)??> + 'start_training': ${config.startTrainingAt}, <#if (config.snapshotInterval)??> - snapshot_interval=${config.snapshotInterval}, + 'snapshot_interval': ${config.snapshotInterval}, - agent_name=agent_name, <#if (config.numMaxSteps)??> - max_episode_step=${config.numMaxSteps}, + 'max_episode_step': ${config.numMaxSteps}, + +<#if (config.evaluationSamples)??> + 'evaluation_samples': ${config.evaluationSamples}, + +<#if (config.outputDirectory)??> + 'output_directory': ${config.outputDirectory}, - output_directory=session_output_dir, - verbose=True, - live_plot = True, - make_logfile=True, <#if (config.targetScore)??> - target_score=${config.targetScore} + 'target_score': ${config.targetScore}, + +<#if (config.rlAlgorithm == "dqn")> +<#include "params/DqnAgentParams.ftl"> +<#else> +<#include "params/DdpgAgentParams.ftl"> + + } + + resume, resume_directory = resume_session(all_output_dir) + + if resume: + output_directory, _ = os.path.split(resume_directory) + ArchLogger.set_output_directory(output_directory) + resume_agent_params = { + 'session_dir': resume_directory, + 'environment': env, +<#if config.rlAlgorithm == "dqn"> + 'net': qnet_creator.net, <#else> - target_score=None + 'actor': actor_creator.net, + 'critic': critic_creator.net - ) + } + agent = ${rlAgentType}.resume_from_session(**resume_agent_params) + else: + agent = ${rlAgentType}(**agent_params) signal_handler = AgentSignalHandler() signal_handler.register_agent(agent) @@ -179,4 +167,8 @@ if __name__ == "__main__": train_successful = agent.train() if train_successful: - agent.save_best_network(net_creator._model_dir_ + net_creator._model_prefix_ + '_newest', epoch=0) \ No newline at end of file +<#if (config.rlAlgorithm == "dqn")> + agent.save_best_network(qnet_creator._model_dir_ + 
qnet_creator._model_prefix_ + '_newest', epoch=0) +<#else> + agent.save_best_network(actor_creator._model_dir_ + actor_creator._model_prefix_ + '_newest', epoch=0) + \ No newline at end of file diff --git a/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl b/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl index acbc974..c06b32a 100644 --- a/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl +++ b/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl @@ -2,118 +2,382 @@ import mxnet as mx import numpy as np import time import os -import logging import sys import util import matplotlib.pyplot as plt +import pyprind +from cnnarch_logger import ArchLogger from replay_memory import ReplayMemoryBuilder -from action_policy import ActionPolicyBuilder -from util import copy_net, get_loss_function +from strategy import StrategyBuilder +from util import copy_net, get_loss_function,\ + copy_net_with_two_inputs, DdpgTrainingStats, DqnTrainingStats,\ + make_directory_if_not_exist from mxnet import nd, gluon, autograd -class DqnAgent(object): - def __init__(self, - network, + +class Agent(object): + def __init__( + self, environment, replay_memory_params, - policy_params, + strategy_params, state_dim, + action_dim, ctx=None, discount_factor=.9, - loss_function='euclidean', - optimizer='rmsprop', - optimizer_params = {'learning_rate':0.09}, training_episodes=50, train_interval=1, - use_fix_target=False, - double_dqn = False, - target_update_interval=10, + start_training=0, snapshot_interval=200, - agent_name='Dqn_agent', + agent_name='Agent', max_episode_step=99999, + evaluation_samples=1000, output_directory='model_parameters', verbose=True, - live_plot = True, - make_logfile=True, - target_score=None): - assert 0 < discount_factor <= 1 - assert train_interval > 0 - assert target_update_interval > 0 - assert snapshot_interval > 0 - assert max_episode_step > 0 - assert training_episodes > 0 - assert replay_memory_params is not None - assert type(state_dim) is tuple - - self.__ctx = mx.gpu() if ctx == 'gpu' else mx.cpu() - self.__qnet = network - - self.__environment = environment - self.__discount_factor = discount_factor - self.__training_episodes = training_episodes - self.__train_interval = train_interval - self.__verbose = verbose - self.__state_dim = state_dim - self.__action_dim = self.__qnet(nd.random_normal(shape=((1,) + self.__state_dim), ctx=self.__ctx)).shape[1:] + target_score=None + ): + assert 0 < discount_factor <= 1,\ + 'Discount factor must be between 0 and 1' + assert train_interval > 0, 'Train interval must be greater 0' + assert snapshot_interval > 0, 'Snapshot interval must be greater 0' + assert max_episode_step > 0,\ + 'Maximal steps per episode must be greater 0' + assert training_episodes > 0, 'Trainings episode must be greater 0' + assert replay_memory_params is not None,\ + 'Replay memory parameter not set' + assert type(state_dim) is tuple, 'State dimension is not a tuple' + assert type(action_dim) is tuple, 'Action dimension is not a tuple' + + self._logger = ArchLogger.get_logger() + self._ctx = mx.gpu() if ctx == 'gpu' else mx.cpu() + self._environment = environment + self._discount_factor = discount_factor + self._training_episodes = training_episodes + self._train_interval = train_interval + self._verbose = verbose + self._state_dim = state_dim replay_memory_params['state_dim'] = state_dim - self.__replay_memory_params = replay_memory_params + replay_memory_params['action_dim'] = action_dim + 
self._replay_memory_params = replay_memory_params rm_builder = ReplayMemoryBuilder() - self.__memory = rm_builder.build_by_params(**replay_memory_params) - self.__minibatch_size = self.__memory.sample_size - - policy_params['action_dim'] = self.__action_dim - self.__policy_params = policy_params - p_builder = ActionPolicyBuilder() - self.__policy = p_builder.build_by_params(**policy_params) - - self.__target_update_interval = target_update_interval - self.__target_qnet = copy_net(self.__qnet, self.__state_dim, ctx=self.__ctx) - self.__loss_function_str = loss_function - self.__loss_function = get_loss_function(loss_function) - self.__agent_name = agent_name - self.__snapshot_interval = snapshot_interval - self.__creation_time = time.time() - self.__max_episode_step = max_episode_step - self.__optimizer = optimizer - self.__optimizer_params = optimizer_params - self.__make_logfile = make_logfile - self.__double_dqn = double_dqn - self.__use_fix_target = use_fix_target - self.__live_plot = live_plot - self.__user_given_directory = output_directory - self.__target_score = target_score - - self.__interrupt_flag = False + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + self._action_dim = action_dim + + strategy_params['action_dim'] = self._action_dim + self._strategy_params = strategy_params + strategy_builder = StrategyBuilder() + self._strategy = strategy_builder.build_by_params(**strategy_params) + self._agent_name = agent_name + self._snapshot_interval = snapshot_interval + self._creation_time = time.time() + self._max_episode_step = max_episode_step + self._start_training = start_training + self._output_directory = output_directory + self._target_score = target_score + + self._evaluation_samples = evaluation_samples + self._best_avg_score = -np.infty + self._best_net = None + + self._interrupt_flag = False + self._training_stats = None # Training Context - self.__current_episode = 0 - self.__total_steps = 0 - - # Initialize best network - self.__best_net = copy_net(self.__qnet, self.__state_dim, self.__ctx) - self.__best_avg_score = None + self._current_episode = 0 + self._total_steps = 0 - # Gluon Trainer definition - self.__training_stats = None + @property + def current_episode(self): + return self._current_episode - # Prepare output directory and logger - self.__output_directory = output_directory\ - + '/' + self.__agent_name\ - + '/' + time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(self.__creation_time)) - self.__logger = self.__setup_logging() - self.__logger.info('Agent created with following parameters: {}'.format(self.__make_config_dict())) + @property + def environment(self): + return self._environment - @classmethod - def from_config_file(cls, network, environment, config_file_path, ctx=None): + def save_config_file(self): import json - # Load config - with open(config_file_path, 'r') as config_file: - config_dict = json.load(config_file) - return cls(network, environment, ctx=ctx, **config_dict) + make_directory_if_not_exist(self._output_directory) + filename = os.path.join(self._output_directory, 'config.json') + config = self._make_config_dict() + with open(filename, mode='w') as fp: + json.dump(config, fp, indent=4) + + def set_interrupt_flag(self, interrupt): + self._interrupt_flag = interrupt + + def _interrupt_training(self): + import pickle + self._logger.info('Training interrupted; Store state for resuming') + session_dir = self._get_session_dir() + agent_session_file = os.path.join(session_dir, 
'agent.p') + logger = self._logger + + self._make_pickle_ready(session_dir) + + with open(agent_session_file, 'wb') as f: + pickle.dump(self, f, protocol=2) + logger.info('State successfully stored') + + def _make_pickle_ready(self, session_dir): + del self._training_stats.logger + self._logger = None + self._environment.close() + self._environment = None + self._save_net(self._best_net, 'best_net', session_dir) + self._best_net = None + + def _make_config_dict(self): + config = dict() + config['state_dim'] = self._state_dim + config['action_dim'] = self._action_dim + config['ctx'] = str(self._ctx) + config['discount_factor'] = self._discount_factor + config['strategy_params'] = self._strategy_params + config['replay_memory_params'] = self._replay_memory_params + config['training_episodes'] = self._training_episodes + config['start_training'] = self._start_training + config['evaluation_samples'] = self._evaluation_samples + config['train_interval'] = self._train_interval + config['snapshot_interval'] = self._snapshot_interval + config['agent_name'] = self._agent_name + config['max_episode_step'] = self._max_episode_step + config['output_directory'] = self._output_directory + config['verbose'] = self._verbose + config['target_score'] = self._target_score + return config + + def _adjust_optimizer_params(self, optimizer_params): + if 'weight_decay' in optimizer_params: + optimizer_params['wd'] = optimizer_params['weight_decay'] + del optimizer_params['weight_decay'] + if 'learning_rate_decay' in optimizer_params: + min_learning_rate = 1e-8 + if 'learning_rate_minimum' in optimizer_params: + min_learning_rate = optimizer_params['learning_rate_minimum'] + del optimizer_params['learning_rate_minimum'] + optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( + optimizer_params['step_size'], + factor=optimizer_params['learning_rate_decay'], + stop_factor_lr=min_learning_rate) + del optimizer_params['step_size'] + del optimizer_params['learning_rate_decay'] + + return optimizer_params + + def _sample_from_memory(self): + states, actions, rewards, next_states, terminals\ + = self._memory.sample(batch_size=self._minibatch_size) + states = nd.array(states, ctx=self._ctx) + actions = nd.array(actions, ctx=self._ctx) + rewards = nd.array(rewards, ctx=self._ctx) + next_states = nd.array(next_states, ctx=self._ctx) + terminals = nd.array(terminals, ctx=self._ctx) + return states, actions, rewards, next_states, terminals + + def evaluate(self, target=None, sample_games=100, verbose=True): + target = self._target_score if target is None else target + if target: + target_achieved = 0 + total_reward = 0 + + self._logger.info('Sampling from {} games...'.format(sample_games)) + for g in pyprind.prog_bar(range(sample_games)): + state = self._environment.reset() + step = 0 + game_reward = 0 + terminal = False + while not terminal and (step < self._max_episode_step): + action = self.get_next_action(state) + state, reward, terminal, _ = self._environment.step(action) + game_reward += reward + step += 1 + + if verbose: + info = 'Game %d: Reward %f' % (g, game_reward) + self._logger.debug(info) + if target: + if game_reward >= target: + target_achieved += 1 + total_reward += game_reward + + avg_reward = float(total_reward)/float(sample_games) + info = 'Avg. 
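# Illustrative, standalone sketch of what _adjust_optimizer_params above does:
# it maps the generator's parameter names onto Gluon Trainer arguments.  The
# concrete values below are assumptions.
import mxnet as mx

opt_params = {'learning_rate': 0.001, 'weight_decay': 0.01,
              'learning_rate_decay': 0.9, 'learning_rate_minimum': 1e-5,
              'step_size': 1000}
opt_params['wd'] = opt_params.pop('weight_decay')
opt_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
    opt_params.pop('step_size'),
    factor=opt_params.pop('learning_rate_decay'),
    stop_factor_lr=opt_params.pop('learning_rate_minimum'))
# opt_params now holds learning_rate, wd and lr_scheduler and can be passed to
# mx.gluon.Trainer(net.collect_params(), 'adam', opt_params).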
Reward: %f' % avg_reward + if target: + target_achieved_ratio = int( + (float(target_achieved)/float(sample_games))*100) + info += '; Target Achieved in %d%% of games'\ + % (target_achieved_ratio) + + if verbose: + self._logger.info(info) + return avg_reward + + def _do_snapshot_if_in_interval(self, episode): + do_snapshot =\ + (episode != 0 and (episode % self._snapshot_interval == 0)) + if do_snapshot: + self.save_parameters(episode=episode) + self._evaluate() + + def _evaluate(self, verbose=True): + avg_reward = self.evaluate( + sample_games=self._evaluation_samples, verbose=False) + info = 'Evaluation -> Average Reward in {} games: {}'.format( + self._evaluation_samples, avg_reward) + + if self._best_avg_score is None or self._best_avg_score <= avg_reward: + self._save_current_as_best_net() + self._best_avg_score = avg_reward + if verbose: + self._logger.info(info) + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _do_training(self): + return (self._total_steps % self._train_interval == 0) and\ + (self._memory.is_sample_possible(self._minibatch_size)) and\ + (self._current_episode >= self._start_training) + + def _check_interrupt_routine(self): + if self._interrupt_flag: + self._interrupt_flag = False + self._interrupt_training() + return True + return False + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + assert self._output_directory + assert isinstance(net, gluon.HybridBlock) + make_directory_if_not_exist(self._output_directory) + + if(episode is not None): + self._logger.info( + 'Saving model parameters after episode %d' % episode) + filename = filename + '-ep{}'.format(episode) + else: + self._logger.info('Saving model parameters') + self._save_net(net, filename) + + def _save_net(self, net, filename, filedir=None): + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename + '.params') + net.save_parameters(filename) + + def save_best_network(self, path, epoch=0): + self._logger.info( + 'Saving best network with average reward of {}'.format( + self._best_avg_score)) + self._best_net.export(path, epoch=epoch) + + def _get_session_dir(self): + session_dir = os.path.join( + self._output_directory, '.interrupted_session') + make_directory_if_not_exist(session_dir) + return session_dir + + def _save_current_as_best_net(self): + raise NotImplementedError + + def get_next_action(self, state): + raise NotImplementedError + + def save_parameters(self, episode): + raise NotImplementedError + + def train(self, episodes=None): + raise NotImplementedError + + +class DdpgAgent(Agent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + 
strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + assert critic is not None, 'Critic not set' + assert actor is not None, 'Actor is not set' + assert soft_target_update_rate > 0,\ + 'Target update must be greater zero' + assert actor_optimizer is not None, 'No actor optimizer set' + assert critic_optimizer is not None, 'No critic optimizer set' + + self._actor = actor + self._critic = critic + + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + self._actor_optimizer = actor_optimizer + self._actor_optimizer_params = self._adjust_optimizer_params( + actor_optimizer_params) + + self._critic_optimizer = critic_optimizer + self._critic_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + self._soft_target_update_rate = soft_target_update_rate + + self._logger.info( + 'Agent created with following parameters: {}'.format( + self._make_config_dict())) + + self._best_net = self._copy_actor() + + self._training_stats = DdpgTrainingStats(self._training_episodes) + + def _make_pickle_ready(self, session_dir): + super(DdpgAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._actor, 'actor', session_dir) + self._actor = None + self._save_net(self._critic, 'critic', session_dir) + self._critic = None + self._save_net(self._actor_target, 'actor_target', session_dir) + self._actor_target = None + self._save_net(self._critic_target, 'critic_target', session_dir) + self._critic_target = None @classmethod - def resume_from_session(cls, session_dir, net, environment): + def resume_from_session(cls, session_dir, actor, critic, environment): import pickle if not os.path.exists(session_dir): raise ValueError('Session directory does not exist') @@ -121,386 +385,516 @@ class DqnAgent(object): files = dict() files['agent'] = os.path.join(session_dir, 'agent.p') files['best_net_params'] = os.path.join(session_dir, 'best_net.params') - files['q_net_params'] = os.path.join(session_dir, 'qnet.params') - files['target_net_params'] = os.path.join(session_dir, 'target_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') for file in files.values(): if not os.path.exists(file): - raise ValueError('Session directory is not complete: {} is missing'.format(file)) + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) with open(files['agent'], 'rb') as f: agent = pickle.load(f) - agent.__environment = environment - agent.__qnet = net - agent.__qnet.load_parameters(files['q_net_params'], agent.__ctx) - agent.__qnet.hybridize() - agent.__qnet(nd.random_normal(shape=((1,) + agent.__state_dim), ctx=agent.__ctx)) - agent.__best_net = copy_net(agent.__qnet, agent.__state_dim, agent.__ctx) - agent.__best_net.load_parameters(files['best_net_params'], agent.__ctx) - agent.__target_qnet = copy_net(agent.__qnet, agent.__state_dim, agent.__ctx) - 
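# Illustrative usage (hypothetical paths and creator objects): an interrupted
# training run is pickled to <output_directory>/.interrupted_session and can be
# resumed with the classmethod defined here, e.g.
#
#   agent = DdpgAgent.resume_from_session(
#       session_dir='model/ddpgAgent/.interrupted_session',
#       actor=actor_creator.net, critic=critic_creator.net,
#       environment=env)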
agent.__target_qnet.load_parameters(files['target_net_params'], agent.__ctx) + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) - agent.__logger = agent.__setup_logging(append=True) - agent.__training_stats.logger = agent.__logger - agent.__logger.info('Agent was retrieved; Training can be continued') + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') return agent - def __interrupt_training(self): - import pickle - self.__logger.info('Training interrupted; Store state for resuming') - session_dir = os.path.join(self.__output_directory, '.interrupted_session') - if not os.path.exists(session_dir): - os.mkdir(session_dir) + def _save_current_as_best_net(self): + self._best_net = self._copy_actor() - del self.__training_stats.logger - logger = self.__logger - self.__logger = None - self.__environment.close() - self.__environment = None + def get_next_action(self, state): + action = self._actor(nd.array([state], ctx=self._ctx)) + return action[0].asnumpy() - self.__save_net(self.__qnet, 'qnet', session_dir) - self.__qnet = None - self.__save_net(self.__best_net, 'best_net', session_dir) - self.__best_net = None - self.__save_net(self.__target_qnet, 'target_net', session_dir) - self.__target_qnet = None + def save_parameters(self, episode): + self._save_parameters(self._actor, episode=episode) - agent_session_file = os.path.join(session_dir, 'agent.p') + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start DDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes - with open(agent_session_file, 'wb') as f: - pickle.dump(self, f) - self.__logger = logger - logger.info('State successfully stored') + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) - @property - def current_episode(self): - return self.__current_episode + # Initialize target Q' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() - @property - def environment(self): - return self.__environment + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() - def __adjust_optimizer_params(self, optimizer_params): - if 'weight_decay' in optimizer_params: - optimizer_params['wd'] = optimizer_params['weight_decay'] - del optimizer_params['weight_decay'] - 
if 'learning_rate_decay' in optimizer_params: - min_learning_rate = 1e-8 - if 'learning_rate_minimum' in optimizer_params: - min_learning_rate = optimizer_params['learning_rate_minimum'] - del optimizer_params['learning_rate_minimum'] - optimizer_params['lr_scheduler'] = mx.lr.scheduler.FactorScheduler( - optimizer_params['step_size'], - factor=optimizer_params['learning_rate_decay'], - stop_factor_lr=min_learning_rate) - del optimizer_params['step_size'] - del optimizer_params['learning_rate_decay'] + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) - return optimizer_params + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False - def set_interrupt_flag(self, interrupt): - self.__interrupt_flag = interrupt + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + actor_target_actions = self._actor_target(next_states) + critic_target_qvalues = self._critic_target( + next_states, actor_target_actions) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) + y = rewards + (1.0 - terminals) * self._discount_factor\ + * critic_target_qvalues + + # Train the critic network + with autograd.record(): + qvalues = self._critic(states, actions) + critic_loss = l2_loss(qvalues, y) + critic_loss.backward() + trainer_critic.step(self._minibatch_size) + + # Train the actor network + # Temporary critic so that gluon trainer does not mess + # with critic parameters + tmp_critic = self._copy_critic() + with autograd.record(): + actor_qvalues = tmp_critic(states, self._actor(states)) + # For maximizing qvalues we have to multiply with -1 + # as we use a minimizer + actor_loss = -1 * actor_qvalues + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss +=\ + np.sum(actor_loss.asnumpy()) / self._minibatch_size + episode_avg_q_value 
+=\ + np.sum(actor_qvalues.asnumpy()) / self._minibatch_size - def __make_output_directory_if_not_exist(self): - assert self.__output_directory - if not os.path.exists(self.__output_directory): - os.makedirs(self.__output_directory) + training_steps += 1 - def __setup_logging(self, append=False): - assert self.__output_directory - assert self.__agent_name + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state - output_level = logging.DEBUG if self.__verbose else logging.WARNING - filemode = 'a' if append else 'w' + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if training_steps == 0\ + else (episode_actor_loss / training_steps) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if training_steps == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + self._strategy.decay(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DdpgAgent, self)._make_config_dict() + config['soft_target_update_rate'] = self._soft_target_update_rate + config['actor_optimizer'] = self._actor_optimizer + config['actor_optimizer_params'] = self._actor_optimizer_params + config['critic_optimizer'] = self._critic_optimizer + config['critic_optimizer_params'] = self._critic_optimizer_params + return config - logformat = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' - dateformat = '%d-%b-%y %H:%M:%S' - formatter = logging.Formatter(fmt=logformat, datefmt=dateformat) + def _soft_update(self, net, target, tau): + net_params = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(target.collect_params().items()): + target_params = p.data() + p.set_data((1.0 - tau) * target_params + tau * net_params[i]) + return target + + def _copy_actor(self): + assert self._actor is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + return copy_net(self._actor, self._state_dim, ctx=self._ctx) + + def _copy_critic(self): + assert self._critic is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic, self._state_dim, self._action_dim, ctx=self._ctx) + + +class DqnAgent(Agent): + def __init__( + self, + qnet, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + loss_function='euclidean', + optimizer='rmsprop', + optimizer_params={'learning_rate': 0.09}, + training_episodes=50, + start_training=0, + train_interval=1, + use_fix_target=False, + double_dqn=False, + target_update_interval=10, + snapshot_interval=200, + evaluation_samples=100, + agent_name='Dqn_agent', + max_episode_step=99999, + 
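# Standalone numeric sketch (assumed shapes, not generated code) of the two
# update rules used in DdpgAgent.train above: the critic target
# y = r + (1 - terminal) * gamma * Q'(s', mu'(s')) and the soft ("Polyak")
# target update theta_target <- (1 - tau) * theta_target + tau * theta.
import numpy as np

gamma, tau = 0.9, 0.001
rewards = np.array([[1.0], [0.0]])
terminals = np.array([[0.0], [1.0]])
critic_target_qvalues = np.array([[2.0], [3.0]])  # Q'(s', mu'(s'))
y = rewards + (1.0 - terminals) * gamma * critic_target_qvalues
# y == [[2.8], [0.0]]: a terminal transition keeps only its immediate reward.

theta = np.array([0.5, -1.0])         # online network parameters
theta_target = np.array([0.0, 0.0])   # target network parameters
theta_target = (1.0 - tau) * theta_target + tau * theta
# theta_target == [0.0005, -0.001]: the target trails the online net slowly.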
output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DqnAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + + self._qnet = qnet + self._target_update_interval = target_update_interval + self._target_qnet = copy_net( + self._qnet, self._state_dim, ctx=self._ctx) + self._loss_function_str = loss_function + self._loss_function = get_loss_function(loss_function) + self._optimizer = optimizer + self._optimizer_params = optimizer_params + self._double_dqn = double_dqn + self._use_fix_target = use_fix_target - logger = logging.getLogger('DQNAgent') - logger.setLevel(output_level) + # Initialize best network + self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) + self._best_avg_score = -np.infty - stream_handler = logging.StreamHandler(sys.stdout) - stream_handler.setLevel(output_level) - stream_handler.setFormatter(formatter) - logger.addHandler(stream_handler) + self._training_stats = None - if self.__make_logfile: - self.__make_output_directory_if_not_exist() - log_file = os.path.join(self.__output_directory, self.__agent_name + '.log') - file_handler = logging.FileHandler(log_file, mode=filemode) - file_handler.setLevel(output_level) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) + @classmethod + def resume_from_session(cls, session_dir, net, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') - return logger + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['q_net_params'] = os.path.join(session_dir, 'qnet.params') + files['target_net_params'] = os.path.join( + session_dir, 'target_net.params') - def __is_target_reached(self, avg_reward): - return self.__target_score is not None\ - and avg_reward > self.__target_score + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + agent._qnet = net + agent._qnet.load_parameters(files['q_net_params'], agent._ctx) + agent._qnet.hybridize() + agent._qnet(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + agent._best_net = copy_net(agent._qnet, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + agent._target_qnet = copy_net( + agent._qnet, agent._state_dim, agent._ctx) + agent._target_qnet.load_parameters( + files['target_net_params'], agent._ctx) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = agent._logger + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _make_pickle_ready(self, session_dir): + super(DqnAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._qnet, 'qnet', session_dir) + self._qnet = None + self._save_net(self._target_qnet, 'target_net', session_dir) + 
self._target_qnet = None def get_q_values(self, state, with_best=False): - return self.get_batch_q_values(nd.array([state], ctx=self.__ctx), with_best=with_best)[0] + return self.get_batch_q_values( + nd.array([state], ctx=self._ctx), with_best=with_best)[0] def get_batch_q_values(self, state_batch, with_best=False): - return self.__best_net(state_batch) if with_best else self.__qnet(state_batch) + return self._best_net(state_batch)\ + if with_best else self._qnet(state_batch) def get_next_action(self, state, with_best=False): q_values = self.get_q_values(state, with_best=with_best) action = q_values.asnumpy().argmax() - return q_values.asnumpy().argmax() + return action - def __sample_from_memory(self): - states, actions, rewards, next_states, terminals\ - = self.__memory.sample(batch_size=self.__minibatch_size) - states = nd.array(states, ctx=self.__ctx) - actions = nd.array(actions, ctx=self.__ctx) - rewards = nd.array(rewards, ctx=self.__ctx) - next_states = nd.array(next_states, ctx=self.__ctx) - terminals = nd.array(terminals, ctx=self.__ctx) - return states, actions, rewards, next_states, terminals - - def __determine_target_q_values(self, states, actions, rewards, next_states, terminals): - if self.__use_fix_target: - q_max_val = self.__target_qnet(next_states) + def __determine_target_q_values( + self, states, actions, rewards, next_states, terminals + ): + if self._use_fix_target: + q_max_val = self._target_qnet(next_states) else: - q_max_val = self.__qnet(next_states) + q_max_val = self._qnet(next_states) - if self.__double_dqn: - q_values_next_states = self.__qnet(next_states) - target_rewards = rewards + nd.choose_element_0index(q_max_val, nd.argmax_channel(q_values_next_states))\ - * (1.0 - terminals) * self.__discount_factor + if self._double_dqn: + q_values_next_states = self._qnet(next_states) + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_values_next_states))\ + * (1.0 - terminals) * self._discount_factor else: - target_rewards = rewards + nd.choose_element_0index(q_max_val, nd.argmax_channel(q_max_val))\ - * (1.0 - terminals) * self.__discount_factor + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_max_val))\ + * (1.0 - terminals) * self._discount_factor - target_qval = self.__qnet(states) + target_qval = self._qnet(states) for t in range(target_rewards.shape[0]): target_qval[t][actions[t]] = target_rewards[t] return target_qval def __train_q_net_step(self, trainer): - states, actions, rewards, next_states, terminals = self.__sample_from_memory() - target_qval = self.__determine_target_q_values(states, actions, rewards, next_states, terminals) + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + target_qval = self.__determine_target_q_values( + states, actions, rewards, next_states, terminals) with autograd.record(): - q_values = self.__qnet(states) - loss = self.__loss_function(q_values, target_qval) + q_values = self._qnet(states) + loss = self._loss_function(q_values, target_qval) loss.backward() - trainer.step(self.__minibatch_size) + trainer.step(self._minibatch_size) return loss - def __do_snapshot_if_in_interval(self, episode): - do_snapshot = (episode != 0 and (episode % self.__snapshot_interval == 0)) - if do_snapshot: - self.save_parameters(episode=episode) - self.__evaluate() - def __do_target_update_if_in_interval(self, total_steps): - do_target_update = (self.__use_fix_target and total_steps % self.__target_update_interval == 0) + do_target_update 
= ( + self._use_fix_target and + (total_steps % self._target_update_interval == 0)) if do_target_update: - self.__logger.info('Target network is updated after {} steps'.format(total_steps)) - self.__target_qnet = copy_net(self.__qnet, self.__state_dim, self.__ctx) + self._logger.info( + 'Target network is updated after {} steps'.format(total_steps)) + self._target_qnet = copy_net( + self._qnet, self._state_dim, self._ctx) def train(self, episodes=None): - self.__logger.info("--- Start training ---") - trainer = gluon.Trainer(self.__qnet.collect_params(), self.__optimizer, self.__adjust_optimizer_params(self.__optimizer_params)) - episodes = episodes if episodes != None else self.__training_episodes - - resume = (self.__current_episode > 0) + self.save_config_file() + self._logger.info("--- Start training ---") + trainer = gluon.Trainer( + self._qnet.collect_params(), + self._optimizer, + self._adjust_optimizer_params(self._optimizer_params)) + episodes = episodes if episodes is not None\ + else self._training_episodes + + resume = (self._current_episode > 0) if resume: - self.__logger.info("Training session resumed") - self.__logger.info("Starting from episode {}".format(self.__current_episode)) + self._logger.info("Training session resumed") + self._logger.info("Starting from episode {}".format( + self._current_episode)) else: - self.__training_stats = util.TrainingStats(self.__logger, episodes, self.__live_plot) + self._training_stats = DqnTrainingStats(episodes) - # Implementation Deep Q Learning described by Mnih et. al. in Playing Atari with Deep Reinforcement Learning - while self.__current_episode < episodes: - # Check interrupt flag - if self.__interrupt_flag: - self.__interrupt_flag = False - self.__interrupt_training() + # Implementation Deep Q Learning described by + # Mnih et. al. in Playing Atari with Deep Reinforcement Learning + while self._current_episode < episodes: + if self._check_interrupt_routine(): return False step = 0 episode_reward = 0 start = time.time() - state = self.__environment.reset() + state = self._environment.reset() episode_loss = 0 training_steps = 0 - while step < self.__max_episode_step: - #1. Choose an action based on current game state and policy - q_values = self.__qnet(nd.array([state], ctx=self.__ctx)) - action = self.__policy.select_action(q_values[0]) + while step < self._max_episode_step: + # 1. Choose an action based on current game state and policy + q_values = self._qnet(nd.array([state], ctx=self._ctx)) + action = self._strategy.select_action(q_values[0]) - #2. Play the game for a single step - next_state, reward, terminal, _ = self.__environment.step(action) + # 2. Play the game for a single step + next_state, reward, terminal, _ =\ + self._environment.step(action) - #3. Store transition in replay memory - self.__memory.append(state, action, reward, next_state, terminal) + # 3. Store transition in replay memory + self._memory.append( + state, action, reward, next_state, terminal) - #4. Train the network if in interval - do_training = (self.__total_steps % self.__train_interval == 0\ - and self.__memory.is_sample_possible(self.__minibatch_size)) - if do_training: + # 4. 
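# Illustrative, standalone sketch (assumed numbers): the targets consumed by
# __train_q_net_step below are built by __determine_target_q_values above;
# fixed-target DQN and double DQN differ only in which network selects the
# maximizing action.
import numpy as np

gamma = 0.9
reward, terminal = 1.0, 0.0
q_target_next = np.array([0.2, 0.7])  # Q_target(s', .)
q_online_next = np.array([0.9, 0.1])  # Q(s', .)

dqn_target = reward + (1.0 - terminal) * gamma * q_target_next.max()
# 1.63: action chosen and evaluated by the target network
double_dqn_target = reward + (1.0 - terminal) * gamma \
    * q_target_next[q_online_next.argmax()]
# 1.18: action chosen by the online network, evaluated by the target network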
Train the network if in interval + if self._do_training(): loss = self.__train_q_net_step(trainer) - loss_sum = sum(loss).asnumpy()[0] - episode_loss += float(loss_sum)/float(self.__minibatch_size) training_steps += 1 + episode_loss +=\ + np.sum(loss.asnumpy()) / self._minibatch_size # Update target network if in interval - self.__do_target_update_if_in_interval(self.__total_steps) + self.__do_target_update_if_in_interval(self._total_steps) step += 1 - self.__total_steps += 1 + self._total_steps += 1 episode_reward += reward state = next_state if terminal: - episode_loss = episode_loss if training_steps > 0 else None - _, _, avg_reward = self.__training_stats.log_episode(self.__current_episode, start, training_steps, - episode_loss, self.__policy.cur_eps, episode_reward) + self._strategy.reset() break - self.__do_snapshot_if_in_interval(self.__current_episode) - self.__policy.decay() - - if self.__is_target_reached(avg_reward): - self.__logger.info('Target score is reached in average; Training is stopped') - break - - self.__current_episode += 1 + self._do_snapshot_if_in_interval(self._current_episode) - self.__evaluate() - training_stats_file = os.path.join(self.__output_directory, 'training_stats.pdf') - self.__training_stats.save_stats(training_stats_file) - self.__logger.info('--------- Training finished ---------') - return True - - def __save_net(self, net, filename, filedir=None): - filedir = self.__output_directory if filedir is None else filedir - filename = os.path.join(filedir, filename + '.params') - net.save_parameters(filename) + episode_loss = (episode_loss / training_steps)\ + if training_steps > 0 else 0 + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_loss, self._strategy.cur_eps, episode_reward) + self._strategy.decay(self._current_episode) - def save_parameters(self, episode=None, filename='dqn-agent-params'): - assert self.__output_directory - self.__make_output_directory_if_not_exist() - - if(episode != None): - self.__logger.info('Saving model parameters after episode %d' % episode) - filename = filename + '-ep{}'.format(episode) - else: - self.__logger.info('Saving model parameters') - self.__save_net(self.__qnet, filename) - - def evaluate(self, target=None, sample_games=100, verbose=True): - target = self.__target_score if target is None else target - if target: - target_achieved = 0 - total_reward = 0 - - for g in range(sample_games): - state = self.__environment.reset() - step = 0 - game_reward = 0 - while step < self.__max_episode_step: - action = self.get_next_action(state) - state, reward, terminal, _ = self.__environment.step(action) - game_reward += reward - - if terminal: - if verbose: - info = 'Game %d: Reward %f' % (g,game_reward) - self.__logger.debug(info) - if target: - if game_reward >= target: - target_achieved += 1 - total_reward += game_reward - break - - step += 1 - - avg_reward = float(total_reward)/float(sample_games) - info = 'Avg. 
Reward: %f' % avg_reward - if target: - target_achieved_ratio = int((float(target_achieved)/float(sample_games))*100) - info += '; Target Achieved in %d%% of games' % (target_achieved_ratio) - - if verbose: - self.__logger.info(info) - return avg_reward - - def __evaluate(self, verbose=True): - sample_games = 100 - avg_reward = self.evaluate(sample_games=sample_games, verbose=False) - info = 'Evaluation -> Average Reward in {} games: {}'.format(sample_games, avg_reward) - - if self.__best_avg_score is None or self.__best_avg_score <= avg_reward: - self.__best_net = copy_net(self.__qnet, self.__state_dim, self.__ctx) - self.__best_avg_score = avg_reward - info += ' (NEW BEST)' - - if verbose: - self.__logger.info(info) - - - - def play(self, update_frame=1, with_best=False): - step = 0 - state = self.__environment.reset() - total_reward = 0 - while step < self.__max_episode_step: - action = self.get_next_action(state, with_best=with_best) - state, reward, terminal, _ = self.__environment.step(action) - total_reward += reward - do_update_frame = (step % update_frame == 0) - if do_update_frame: - self.__environment.render() - time.sleep(.100) - - if terminal: + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') break - step += 1 - return total_reward + self._current_episode += 1 - def save_best_network(self, path, epoch=0): - self.__logger.info('Saving best network with average reward of {}'.format(self.__best_avg_score)) - self.__best_net.export(path, epoch=epoch) + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True - def __make_config_dict(self): - config = dict() - config['discount_factor'] = self.__discount_factor - config['optimizer'] = self.__optimizer - config['optimizer_params'] = self.__optimizer_params - config['policy_params'] = self.__policy_params - config['replay_memory_params'] = self.__replay_memory_params - config['loss_function'] = self.__loss_function_str - config['optimizer'] = self.__optimizer - config['training_episodes'] = self.__training_episodes - config['train_interval'] = self.__train_interval - config['use_fix_target'] = self.__use_fix_target - config['double_dqn'] = self.__double_dqn - config['target_update_interval'] = self.__target_update_interval - config['snapshot_interval']= self.__snapshot_interval - config['agent_name'] = self.__agent_name - config['max_episode_step'] = self.__max_episode_step - config['output_directory'] = self.__user_given_directory - config['verbose'] = self.__verbose - config['live_plot'] = self.__live_plot - config['make_logfile'] = self.__make_logfile - config['target_score'] = self.__target_score + def _make_config_dict(self): + config = super(DqnAgent, self)._make_config_dict() + config['optimizer'] = self._optimizer + config['optimizer_params'] = self._optimizer_params + config['loss_function'] = self._loss_function_str + config['use_fix_target'] = self._use_fix_target + config['double_dqn'] = self._double_dqn + config['target_update_interval'] = self._target_update_interval return config - def save_config_file(self): - import json - self.__make_output_directory_if_not_exist() - filename = os.path.join(self.__output_directory, 'config.json') - config = self.__make_config_dict() - with open(filename, mode='w') as fp: - json.dump(config, fp, 
indent=4) \ No newline at end of file + def save_parameters(self, episode): + self._save_parameters(self._qnet, episode=episode) + + def _save_current_as_best_net(self): + self._best_net = copy_net( + self._qnet, (1,) + self._state_dim, ctx=self._ctx) diff --git a/src/main/resources/templates/gluon/reinforcement/agent/ReplayMemory.ftl b/src/main/resources/templates/gluon/reinforcement/agent/ReplayMemory.ftl index e66cd93..d22147a 100644 --- a/src/main/resources/templates/gluon/reinforcement/agent/ReplayMemory.ftl +++ b/src/main/resources/templates/gluon/reinforcement/agent/ReplayMemory.ftl @@ -1,58 +1,90 @@ import numpy as np + class ReplayMemoryBuilder(object): def __init__(self): self.__supported_methods = ['online', 'buffer', 'combined'] - def build_by_params(self, + def build_by_params( + self, state_dim, method='online', state_dtype='float32', + action_dim=(1,), action_dtype='uint8', rewards_dtype='float32', memory_size=1000, - sample_size=32): + sample_size=32 + ): assert state_dim is not None + assert action_dim is not None assert method in self.__supported_methods if method == 'online': - return self.build_online_memory(state_dim=state_dim, state_dtype=state_dtype, - action_dtype=action_dtype, rewards_dtype=rewards_dtype) + return self.build_online_memory( + state_dim=state_dim, state_dtype=state_dtype, + action_dtype=action_dtype, action_dim=action_dim, + rewards_dtype=rewards_dtype) else: assert memory_size is not None and memory_size > 0 assert sample_size is not None and sample_size > 0 if method == 'buffer': - return self.build_buffered_memory(state_dim=state_dim, sample_size=sample_size, - memory_size=memory_size, state_dtype=state_dtype, action_dtype=action_dtype, + return self.build_buffered_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, rewards_dtype=rewards_dtype) else: - return self.build_combined_memory(state_dim=state_dim, sample_size=sample_size, - memory_size=memory_size, state_dtype=state_dtype, action_dtype=action_dtype, + return self.build_combined_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, rewards_dtype=rewards_dtype) - def build_buffered_memory(self, state_dim, memory_size=1000, sample_size=1, state_dtype='float32', - action_dtype='uint8', rewards_dtype='float32'): + def build_buffered_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): assert memory_size > 0 assert sample_size > 0 - return ReplayMemory(state_dim, size=memory_size, sample_size=sample_size, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) - - def build_combined_memory(self, state_dim, memory_size=1000, sample_size=1, state_dtype='float32', - action_dtype='uint8', rewards_dtype='float32'): + return ReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_combined_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): assert memory_size > 0 assert sample_size > 0 - return CombinedReplayMemory(state_dim, size=memory_size, sample_size=sample_size, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) + return CombinedReplayMemory( + state_dim, size=memory_size, 
sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_online_memory( + self, state_dim, state_dtype, action_dtype, action_dim, rewards_dtype + ): + return OnlineReplayMemory( + state_dim, state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) - def build_online_memory(self, state_dim, state_dtype='float32', action_dtype='uint8', - rewards_dtype='float32'): - return OnlineReplayMemory(state_dim, state_dtype=state_dtype, action_dtype=action_dtype, - rewards_dtype=rewards_dtype) class ReplayMemory(object): - def __init__(self, state_dim, sample_size, size=1000, state_dtype='uint8', action_dtype='uint8', rewards_dtype='float32'): + def __init__( + self, + state_dim, + sample_size, + size=1000, + action_dim=(1,), + state_dtype='float32', + action_dtype='uint8', + rewards_dtype='float32' + ): assert size > 0, "Size must be greater than zero" assert type(state_dim) is tuple, "State dimension must be a tuple" + assert type(action_dim) is tuple, "Action dimension must be a tuple" assert sample_size > 0 self._size = size self._sample_size = sample_size @@ -60,12 +92,15 @@ class ReplayMemory(object): self._pointer = 0 self._state_dim = state_dim self._state_dtype = state_dtype + self._action_dim = action_dim self._action_dtype = action_dtype self._rewards_dtype = rewards_dtype self._states = np.zeros((self._size,) + state_dim, dtype=state_dtype) - self._actions = np.array([0] * self._size, dtype=action_dtype) + self._actions = np.zeros( + (self._size,) + action_dim, dtype=action_dtype) self._rewards = np.array([0] * self._size, dtype=rewards_dtype) - self._next_states = np.zeros((self._size,) + state_dim, dtype=state_dtype) + self._next_states = np.zeros( + (self._size,) + state_dim, dtype=state_dtype) self._terminals = np.array([0] * self._size, dtype='bool') @property @@ -93,17 +128,23 @@ class ReplayMemory(object): self._terminals[index] def is_sample_possible(self, batch_size=None): - batch_size = batch_size if batch_size is not None else self._sample_size + batch_size = batch_size if batch_size is not None\ + else self._sample_size return self._cur_size >= batch_size def sample(self, batch_size=None): - batch_size = batch_size if batch_size is not None else self._sample_size - assert self._cur_size >= batch_size, "Size of replay memory must be larger than batch size" - i=0 - states = np.zeros((batch_size,)+self._state_dim, dtype=self._state_dtype) - actions = np.zeros(batch_size, dtype=self._action_dtype) + batch_size = batch_size if batch_size is not None\ + else self._sample_size + assert self._cur_size >= batch_size,\ + "Size of replay memory must be larger than batch size" + i = 0 + states = np.zeros(( + batch_size,)+self._state_dim, dtype=self._state_dtype) + actions = np.zeros( + (batch_size,)+self._action_dim, dtype=self._action_dtype) rewards = np.zeros(batch_size, dtype=self._rewards_dtype) - next_states = np.zeros((batch_size,)+self._state_dim, dtype=self._state_dtype) + next_states = np.zeros( + (batch_size,)+self._state_dim, dtype=self._state_dtype) terminals = np.zeros(batch_size, dtype='bool') while i < batch_size: @@ -119,25 +160,35 @@ class ReplayMemory(object): class OnlineReplayMemory(ReplayMemory): - def __init__(self, state_dim, state_dtype='float32', action_dtype='uint8', rewards_dtype='float32'): - super(OnlineReplayMemory, self).__init__(state_dim, sample_size=1, size=1, - state_dtype=state_dtype, action_dtype=action_dtype, 
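# Illustrative usage of the builder above inside the generated package (sizes
# and dtypes are assumptions; DDPG stores real-valued action vectors, hence an
# explicit action_dim and a float32 action_dtype):
import numpy as np
from replay_memory import ReplayMemoryBuilder

memory = ReplayMemoryBuilder().build_by_params(
    state_dim=(8,), action_dim=(2,), method='buffer',
    state_dtype='float32', action_dtype='float32',
    memory_size=10000, sample_size=32)
for _ in range(64):
    memory.append(np.zeros(8, dtype='float32'), np.zeros(2, dtype='float32'),
                  0.0, np.zeros(8, dtype='float32'), False)
if memory.is_sample_possible():
    states, actions, rewards, next_states, terminals = memory.sample()
    # states: (32, 8), actions: (32, 2), rewards and terminals: (32,)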
rewards_dtype=rewards_dtype) + def __init__( + self, state_dim, state_dtype='float32', action_dim=(1,), + action_dtype='uint8', rewards_dtype='float32' + ): + super(OnlineReplayMemory, self).__init__( + state_dim, sample_size=1, size=1, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) class CombinedReplayMemory(ReplayMemory): - def __init__(self, state_dim, sample_size, size=1000, - state_dtype='uint8', action_dtype='uint8', rewards_dtype='float32'): - super(CombinedReplayMemory, self).__init__(state_dim, sample_size=(sample_size - 1), size=size, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) + def __init__( + self, state_dim, sample_size, size=1000, state_dtype='float32', + action_dim=(1,), action_dtype='uint8', rewards_dtype='float32' + ): + super(CombinedReplayMemory, self).__init__( + state_dim=state_dim, sample_size=(sample_size - 1), size=size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) self._last_state = np.zeros((1,) + state_dim, dtype=state_dtype) - self._last_action = np.array([0], dtype=action_dtype) + self._last_action = np.array((1,) + action_dim, dtype=action_dtype) self._last_reward = np.array([0], dtype=rewards_dtype) self._last_next_state = np.zeros((1,) + state_dim, dtype=state_dtype) self._last_terminal = np.array([0], dtype='bool') def append(self, state, action, reward, next_state, terminal): - super(CombinedReplayMemory, self).append(state, action, reward, next_state, terminal) + super(CombinedReplayMemory, self).append( + state, action, reward, next_state, terminal) self._last_state = state self._last_action = action self._last_reward = reward @@ -145,8 +196,10 @@ class CombinedReplayMemory(ReplayMemory): self._last_terminal = terminal def sample(self, batch_size=None): - batch_size = (batch_size-1) if batch_size is not None else self._sample_size - states, actions, rewards, next_states, terminals = super(CombinedReplayMemory, self).sample(batch_size=batch_size) + batch_size = (batch_size-1) if batch_size is not None\ + else self._sample_size + states, actions, rewards, next_states, terminals = super( + CombinedReplayMemory, self).sample(batch_size=batch_size) states = np.append(states, [self._last_state], axis=0) actions = np.append(actions, [self._last_action], axis=0) rewards = np.append(rewards, [self._last_reward], axis=0) diff --git a/src/main/resources/templates/gluon/reinforcement/agent/Strategy.ftl b/src/main/resources/templates/gluon/reinforcement/agent/Strategy.ftl new file mode 100644 index 0000000..4c47adc --- /dev/null +++ b/src/main/resources/templates/gluon/reinforcement/agent/Strategy.ftl @@ -0,0 +1,172 @@ +import numpy as np + + +class StrategyBuilder(object): + def __init__(self): + pass + + def build_by_params( + self, + method='epsgreedy', + epsilon=0.5, + min_epsilon=0.05, + epsilon_decay_method='no', + epsilon_decay=0.0, + epsilon_decay_start=0, + action_dim=None, + action_low=None, + action_high=None, + mu=0.0, + theta=0.5, + sigma=0.3 + ): + + if epsilon_decay_method == 'linear': + decay = LinearDecay( + eps_decay=epsilon_decay, min_eps=min_epsilon, + decay_start=epsilon_decay_start) + else: + decay = NoDecay() + + if method == 'epsgreedy': + assert action_dim is not None + assert len(action_dim) == 1 + return EpsilonGreedyStrategy( + eps=epsilon, number_of_actions=action_dim[0], + decay_method=decay) + elif method == 'ornstein_uhlenbeck': + assert action_dim is not None + assert 
action_low is not None + assert action_high is not None + assert mu is not None + assert theta is not None + assert sigma is not None + return OrnsteinUhlenbeckStrategy( + action_dim, action_low, action_high, epsilon, mu, theta, + sigma, decay) + else: + assert action_dim is not None + assert len(action_dim) == 1 + return GreedyStrategy() + + +class BaseDecay(object): + def __init__(self): + pass + + def decay(self, *args): + raise NotImplementedError + + def __call__(self, *args): + return self.decay(*args) + + +class NoDecay(BaseDecay): + def __init__(self): + super(NoDecay, self).__init__() + + def decay(self, cur_eps, episode): + return cur_eps + + +class LinearDecay(BaseDecay): + def __init__(self, eps_decay, min_eps=0, decay_start=0): + super(LinearDecay, self).__init__() + self.eps_decay = eps_decay + self.min_eps = min_eps + self.decay_start = decay_start + + def decay(self, cur_eps, episode): + if episode < self.decay_start: + return cur_eps + else: + return max(cur_eps - self.eps_decay, self.min_eps) + + +class BaseStrategy(object): + def __init__(self, decay_method): + self._decay_method = decay_method + + def select_action(self, values, decay_method): + raise NotImplementedError + + def decay(self, episode): + self.cur_eps = self._decay_method.decay(self.cur_eps, episode) + + def reset(self): + pass + + +class EpsilonGreedyStrategy(BaseStrategy): + def __init__(self, eps, number_of_actions, decay_method): + super(EpsilonGreedyStrategy, self).__init__(decay_method) + self.eps = eps + self.cur_eps = eps + self.__number_of_actions = number_of_actions + + def select_action(self, values): + do_exploration = (np.random.rand() < self.cur_eps) + if do_exploration: + action = np.random.randint(low=0, high=self.__number_of_actions) + else: + action = values.asnumpy().argmax() + return action + + +class GreedyStrategy(BaseStrategy): + def __init__(self): + super(GreedyStrategy, self).__init__(None) + + def select_action(self, values): + return values.asnumpy().argmax() + + def decay(self): + pass + + +class OrnsteinUhlenbeckStrategy(BaseStrategy): + """ + Ornstein-Uhlenbeck process: dxt = theta * (mu - xt) * dt + sigma * dWt + where Wt denotes the Wiener process. 
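    With step size dt = 1 the discretised update implemented in _evolve_state
    below is x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1), i.e. noise
    that is temporally correlated and mean-reverting towards mu, which is why
    it is a common exploration strategy for continuous action spaces.

    Illustrative usage (action shape and bounds are assumptions):

        strategy = OrnsteinUhlenbeckStrategy(
            action_dim=(2,), action_low=-1.0, action_high=1.0, eps=1.0)
        noisy_action = strategy.select_action(np.array([0.1, -0.2]))
        strategy.reset()  # restart the noise process at mu after an episode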
+ """ + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + mu=0.0, + theta=.15, + sigma=.3, + decay=NoDecay() + ): + super(OrnsteinUhlenbeckStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._decay_method = decay + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._mu = np.array(mu) + self._theta = np.array(theta) + self._sigma = np.array(sigma) + + self.state = np.ones(self._action_dim) * self._mu + + def _evolve_state(self): + x = self.state + dx = self._theta * (self._mu - x)\ + + self._sigma * np.random.randn(len(x)) + self.state = x + dx + return self.state + + def reset(self): + self.state = np.ones(self._action_dim) * self._mu + + def select_action(self, values): + noise = self._evolve_state() + action = values + (self.cur_eps * noise) + return np.clip(action, self._action_low, self._action_high) diff --git a/src/main/resources/templates/gluon/reinforcement/architecture/CriticArchitecture.ftl b/src/main/resources/templates/gluon/reinforcement/architecture/CriticArchitecture.ftl new file mode 100644 index 0000000..6fc85df --- /dev/null +++ b/src/main/resources/templates/gluon/reinforcement/architecture/CriticArchitecture.ftl @@ -0,0 +1,7 @@ +architecture ${architectureName}() { + def input ${stateType}<#if stateRange??>(<#if stateRange.isLowerLimitInfinity()>-oo<#else>${stateRange.lowerLimit.get()}:<#if stateRange.isUpperLimitInfinity()>oo<#else>${stateRange.upperLimit.get()})^{<#list stateDimension as d>${d}<#if d?has_next>,} state + def input ${actionType}<#if actionRange??>(<#if actionRange.isLowerLimitInfinity()>-oo<#else>${actionRange.lowerLimit.get()}:<#if actionRange.isUpperLimitInfinity()>oo<#else>${actionRange.upperLimit.get()})^{<#list actionDimension as d>${d}<#if d?has_next>,} action + def output Q(-oo:oo)^{1} qvalue + + ${implementation}->FullyConnected(units=1)->qvalue +} \ No newline at end of file diff --git a/src/main/resources/templates/gluon/reinforcement/environment/Environment.ftl b/src/main/resources/templates/gluon/reinforcement/environment/Environment.ftl index e2104ee..2b76d40 100644 --- a/src/main/resources/templates/gluon/reinforcement/environment/Environment.ftl +++ b/src/main/resources/templates/gluon/reinforcement/environment/Environment.ftl @@ -13,9 +13,11 @@ class RewardFunction(object): self.__reward_wrapper.init() def reward(self, state, terminal): + s = state.astype('${config.rewardFunctionStateParameter.dtype}') + t = bool(terminal) inp = ${config.rewardFunctionName}_executor.${config.rewardFunctionName}_input() - inp.${config.rewardFunctionStateParameter.name} = state - inp.${config.rewardFunctionTerminalParameter.name} = terminal + inp.${config.rewardFunctionStateParameter.name} = s + inp.${config.rewardFunctionTerminalParameter.name} = t output = self.__reward_wrapper.execute(inp) return output.${config.rewardFunctionOutputName} @@ -57,13 +59,6 @@ class GymEnvironment(Environment): def state_dim(self): return self.__env.observation_space.shape - @property - def state_dtype(self): - return 'float32' - - @property - def action_dtype(self): - return 'uint8' @property def number_of_actions(self): @@ -100,11 +95,12 @@ class GymEnvironment(Environment): def render(self): self.__env.render() <#else> +<#assign action_ros_datatype=config.discreteRlAlgorithm?string("Int32", "Float32MultiArray")> import rospy import thread import numpy as np import time -from std_msgs.msg import Float32MultiArray, Bool, Int32 +from std_msgs.msg import 
Float32MultiArray, Bool, Int32, MultiArrayDimension, Float32 class RosEnvironment(Environment): def __init__(self, @@ -114,19 +110,21 @@ class RosEnvironment(Environment): action_topic='action', reset_topic='reset', terminal_state_topic='terminal', - meta_topic='meta', - greeting_topic='greeting'): + reward_topic='reward'): super(RosEnvironment, self).__init__() self.__timeout_in_s = timeout_in_s - self.__waiting_for_state_update = False self.__waiting_for_terminal_update = False self.__last_received_state = 0 - self.__last_received_terminal = 0 + self.__last_received_terminal = True +<#if config.hasRosRewardTopic()> + self.__last_received_reward = 0.0 + self.__waiting_for_reward_update = False + rospy.loginfo("Initialize node {0}".format(ros_node_name)) - self.__step_publisher = rospy.Publisher(action_topic, Int32, queue_size=1) + self.__step_publisher = rospy.Publisher(action_topic, ${action_ros_datatype}, queue_size=1) rospy.loginfo('Step Publisher initialized with topic {}'.format(action_topic)) self.__reset_publisher = rospy.Publisher(reset_topic, Bool, queue_size=1) @@ -139,6 +137,11 @@ class RosEnvironment(Environment): self.__terminal_state_subscriber = rospy.Subscriber(terminal_state_topic, Bool, self.__terminal_state_callback) rospy.loginfo('Terminal State Subscriber registered with topic {}'.format(terminal_state_topic)) +<#if config.hasRosRewardTopic()> + + self.__reward_subscriber = rospy.Subscriber(reward_topic, Float32, self.__reward_callback) + rospy.loginfo('Reward Subscriber registered with topic {}'.format(reward_topic)) + rate = rospy.Rate(10) @@ -156,18 +159,28 @@ class RosEnvironment(Environment): return self.__last_received_state def step(self, action): - action_rospy = Int32() + action_rospy = ${action_ros_datatype}() +<#if config.continuousRlAlgorithm> + assert len(action.shape) == 1 + action_rospy.layout.dim.append(MultiArrayDimension()) + action_rospy.layout.dim[0].label = 'action' + action_rospy.layout.dim[0].size = ${config.actionDim[0]} + action_rospy.layout.dim[0].stride = ${config.actionDim[0]} + action_rospy.data = action logger.debug('Send action: {}'.format(action)) self.__waiting_for_state_update = True self.__waiting_for_terminal_update = True +<#if config.hasRosRewardTopic()> + self.__waiting_for_reward_update = True + self.__step_publisher.publish(action_rospy) self.__wait_for_new_state(self.__step_publisher, action_rospy) next_state = self.__last_received_state terminal = self.__last_received_terminal - reward = self.__calc_reward(next_state, terminal) + reward = <#if config.hasRosRewardTopic()>self.__last_received_reward<#else>self.__calc_reward(next_state, terminal) rospy.logdebug('Calculated reward: {}'.format(reward)) return next_state, reward, terminal, 0 @@ -175,7 +188,8 @@ class RosEnvironment(Environment): def __wait_for_new_state(self, publisher, msg): time_of_timeout = time.time() + self.__timeout_in_s timeout_counter = 0 - while(self.__waiting_for_state_update or self.__waiting_for_terminal_update): + while(self.__waiting_for_state_update + or self.__waiting_for_terminal_update<#if config.hasRosRewardTopic()> or self.__waiting_for_reward_update): is_timeout = (time.time() > time_of_timeout) if (is_timeout): if timeout_counter < 3: @@ -191,27 +205,25 @@ class RosEnvironment(Environment): def close(self): rospy.signal_shutdown('Program ended!') - def __state_callback(self, data): -<#if config.rewardFunctionStateParameter.isMultiDimensional> - self.__last_received_state = np.array(data.data, 
dtype='${config.rewardFunctionStateParameter.dtype}') -<#else> - self.__last_received_state = data.data - + self.__last_received_state = np.array(data.data, dtype='float32') rospy.logdebug('Received state: {}'.format(self.__last_received_state)) self.__waiting_for_state_update = False def __terminal_state_callback(self, data): -<#if config.rewardFunctionTerminalParameter.isMultiDimensional> - self.__last_received_terminal = np.array(data.data, dtype='${config.rewardFunctionTerminalParameter.dtype}') -<#else> self.__last_received_terminal = data.data - rospy.logdebug('Received terminal flag: {}'.format(self.__last_received_terminal)) logger.debug('Received terminal: {}'.format(self.__last_received_terminal)) self.__waiting_for_terminal_update = False +<#if config.hasRosRewardTopic()> + def __reward_callback(self, data): + self.__last_received_reward = float(data.data) + logger.debug('Received reward: {}'.format(self.__last_received_reward)) + self.__waiting_for_reward_update = False +<#else> def __calc_reward(self, state, terminal): # C++ Wrapper call return self._reward_function.reward(state, terminal) - \ No newline at end of file + + diff --git a/src/main/resources/templates/gluon/reinforcement/params/DdpgAgentParams.ftl b/src/main/resources/templates/gluon/reinforcement/params/DdpgAgentParams.ftl new file mode 100644 index 0000000..ec22fc2 --- /dev/null +++ b/src/main/resources/templates/gluon/reinforcement/params/DdpgAgentParams.ftl @@ -0,0 +1,19 @@ + 'actor': actor_creator.net, + 'critic': critic_creator.net, +<#if (config.softTargetUpdateRate)??> + 'soft_target_update_rate': ${config.softTargetUpdateRate}, + +<#if (config.configuration.optimizer)??> + 'actor_optimizer': '${config.optimizerName}', + 'actor_optimizer_params': { +<#list config.optimizerParams?keys as param> + '${param}': ${config.optimizerParams[param]}<#sep>, +}, + +<#if (config.configuration.criticOptimizer)??> + 'critic_optimizer': '${config.criticOptimizerName}', + 'critic_optimizer_params': { +<#list config.criticOptimizerParams?keys as param> + '${param}': ${config.criticOptimizerParams[param]}<#sep>, +}, + diff --git a/src/main/resources/templates/gluon/reinforcement/params/DqnAgentParams.ftl b/src/main/resources/templates/gluon/reinforcement/params/DqnAgentParams.ftl new file mode 100644 index 0000000..67f2dfb --- /dev/null +++ b/src/main/resources/templates/gluon/reinforcement/params/DqnAgentParams.ftl @@ -0,0 +1,23 @@ + 'qnet':qnet_creator.net, +<#if (config.useFixTargetNetwork)?? && config.useFixTargetNetwork> + 'use_fix_target': True, + 'target_update_interval': ${config.targetNetworkUpdateInterval}, +<#else> + 'use_fix_target': False, + +<#if (config.loss)??> + 'loss_function': '${config.loss}', + +<#if (config.configuration.optimizer)??> + 'optimizer': '${config.optimizerName}', + 'optimizer_params': { +<#list config.optimizerParams?keys as param> + '${param}': ${config.optimizerParams[param]}<#sep>, + + }, + +<#if (config.useDoubleDqn)?? 
&& config.useDoubleDqn> + 'double_dqn': True, +<#else> + 'double_dqn': False, + diff --git a/src/main/resources/templates/gluon/reinforcement/params/ReplayMemoryParams.ftl b/src/main/resources/templates/gluon/reinforcement/params/ReplayMemoryParams.ftl new file mode 100644 index 0000000..944187f --- /dev/null +++ b/src/main/resources/templates/gluon/reinforcement/params/ReplayMemoryParams.ftl @@ -0,0 +1,14 @@ +<#if (config.replayMemory)??> + 'method': '${config.replayMemory.method}', +<#if (config.replayMemory.memory_size)??> + 'memory_size': ${config.replayMemory.memory_size}, + +<#if (config.replayMemory.sample_size)??> + 'sample_size': ${config.replayMemory.sample_size}, + +<#else> + 'method': 'online', + + 'state_dtype': 'float32', + 'action_dtype': <#if config.rlAlgorithm=="DQN">'uint8'<#else>'float32', + 'rewards_dtype': 'float32' diff --git a/src/main/resources/templates/gluon/reinforcement/params/StrategyParams.ftl b/src/main/resources/templates/gluon/reinforcement/params/StrategyParams.ftl new file mode 100644 index 0000000..836a951 --- /dev/null +++ b/src/main/resources/templates/gluon/reinforcement/params/StrategyParams.ftl @@ -0,0 +1,39 @@ +<#if (config.strategy)??> + 'method':'${config.strategy.method}', + +<#if (config.strategy.epsilon)??> + 'epsilon': ${config.strategy.epsilon}, + +<#if (config.strategy.min_epsilon)??> + 'min_epsilon': ${config.strategy.min_epsilon}, + +<#if (config.strategy.epsilon_decay_method)??> + 'epsilon_decay_method': '${config.strategy.epsilon_decay_method}', + +<#if (config.strategy.epsilon_decay)??> + 'epsilon_decay': ${config.strategy.epsilon_decay}, + +<#if (config.strategy.epsilon_decay_start)??> + 'epsilon_decay_start': ${config.strategy.epsilon_decay_start}, + +<#if (config.strategy.method)?? && (config.strategy.method=="ornstein_uhlenbeck")> +<#if (config.strategy.action_low)?? > + 'action_low': ${config.strategy.action_low}, +<#else> + 'action_low': -np.infty, + +<#if (config.strategy.action_high)?? > + 'action_high': ${config.strategy.action_high}, +<#else> + 'action_high' : np.infty, + +<#if (config.strategy.mu)??> + 'mu': [<#list config.strategy.mu as m>${m}<#if m?has_next>, ], + +<#if (config.strategy.theta)??> + 'theta': [<#list config.strategy.theta as t>${t}<#if t?has_next>, ], + +<#if (config.strategy.sigma)??> + 'sigma': [<#list config.strategy.sigma as s>${s}<#if s?has_next>, ], + + diff --git a/src/main/resources/templates/gluon/reinforcement/util/Logger.ftl b/src/main/resources/templates/gluon/reinforcement/util/Logger.ftl new file mode 100644 index 0000000..19f36b1 --- /dev/null +++ b/src/main/resources/templates/gluon/reinforcement/util/Logger.ftl @@ -0,0 +1,95 @@ +import logging +import sys +import os +import util + + +class ArchLogger(object): + _logger = None + + __output_level = logging.INFO + __logger_name = 'agent' + __output_directory = '.' 
+ __append = True + __logformat = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + __dateformat = '%d-%b-%y %H:%M:%S' + + INFO = logging.INFO + DEBUG = logging.DEBUG + + @staticmethod + def set_output_level(output_level): + assert output_level is not None + ArchLogger.__output_level = output_level + + @staticmethod + def set_logger_name(logger_name): + assert logger_name is not None + ArchLogger.__logger_name = logger_name + + @staticmethod + def set_output_directory(output_directory): + assert output_directory is not None + ArchLogger.__output_directory = output_directory + + @staticmethod + def set_append(append): + assert append is not None + ArchLogger.__append = append + + @staticmethod + def set_log_format(logformat, dateformat): + assert logformat is not None + assert dateformat is not None + ArchLogger.__logformat = logformat + ArchLogger.__dateformat = dateformat + + @staticmethod + def init_logger(make_log_file=True): + assert ArchLogger._logger is None, 'Logger init already called' + filemode = 'a' if ArchLogger.__append else 'w' + formatter = logging.Formatter( + fmt=ArchLogger.__logformat, datefmt=ArchLogger.__dateformat) + + logger = logging.getLogger(ArchLogger.__logger_name) + logger.propagate = False + + if not logger.handlers: + logger.setLevel(ArchLogger.__output_level) + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setLevel(ArchLogger.__output_level) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + if make_log_file: + util.make_directory_if_not_exist(ArchLogger.__output_directory) + log_file = os.path.join( + ArchLogger.__output_directory, + ArchLogger.__logger_name + '.log') + file_handler = logging.FileHandler(log_file, mode=filemode) + file_handler.setLevel(ArchLogger.__output_level) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + ArchLogger._logger = logger + + @staticmethod + def get_logger(): + if ArchLogger._logger is None: + ArchLogger.init_logger() + assert ArchLogger._logger is not None + return ArchLogger._logger + +if __name__ == "__main__": + print('=== Test logger ===') + ArchLogger.set_logger_name('TestLogger') + ArchLogger.set_output_directory('test_log') + ArchLogger.init_logger() + logger = ArchLogger.get_logger() + logger.warning('This is a warning') + logger.debug('This is a debug information, which you should not see') + logger.info('This is a normal information') + assert os.path.exists('test_log')\ + and os.path.isfile(os.path.join('test_log', 'TestLogger.log')),\ + 'Test failed: No logfile exists' + import shutil + shutil.rmtree('test_log') \ No newline at end of file diff --git a/src/main/resources/templates/gluon/reinforcement/util/Util.ftl b/src/main/resources/templates/gluon/reinforcement/util/Util.ftl index 5857893..a1da1ce 100644 --- a/src/main/resources/templates/gluon/reinforcement/util/Util.ftl +++ b/src/main/resources/templates/gluon/reinforcement/util/Util.ftl @@ -5,9 +5,9 @@ import matplotlib.pyplot as plt from matplotlib import style import time import os -import mxnet +import mxnet as mx from mxnet import gluon, nd - +import cnnarch_logger LOSS_FUNCTIONS = { 'l1': gluon.loss.L1Loss(), @@ -16,17 +16,46 @@ LOSS_FUNCTIONS = { 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} -def copy_net(net, input_state_dim, ctx, tmp_filename='tmp.params'): + +def make_directory_if_not_exist(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + +def 
copy_net(net, input_state_dim, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2(mx.nd.ones((1,) + input_state_dim, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + + return net2 + + +def copy_net_with_two_inputs(net, input_state_dim1, input_state_dim2, ctx): assert isinstance(net, gluon.HybridBlock) assert type(net.__class__) is type - net.save_parameters(tmp_filename) + net2 = net.__class__() - net2.load_parameters(tmp_filename, ctx=ctx) - os.remove(tmp_filename) + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) net2.hybridize() - net2(nd.ones((1,) + input_state_dim, ctx=ctx)) + net2( + nd.ones((1,) + input_state_dim1, ctx=ctx), + nd.ones((1,) + input_state_dim2, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + return net2 + def get_loss_function(loss_function_name): if loss_function_name not in LOSS_FUNCTIONS: raise ValueError('Loss function does not exist') @@ -52,89 +81,196 @@ class AgentSignalHandler(object): sys.exit(1) style.use('fivethirtyeight') + + class TrainingStats(object): - def __init__(self, logger, max_episodes, live_plot=True): - self.__logger = logger - self.__max_episodes = max_episodes - self.__all_avg_loss = np.zeros((max_episodes,)) - self.__all_total_rewards = np.zeros((max_episodes,)) - self.__all_eps = np.zeros((max_episodes,)) - self.__all_time = np.zeros((max_episodes,)) - self.__all_mean_reward_last_100_episodes = np.zeros((max_episodes,)) - self.__live_plot = live_plot + def __init__(self, max_episodes): + self._logger = cnnarch_logger.ArchLogger.get_logger() + self._max_episodes = max_episodes + self._all_total_rewards = np.zeros((max_episodes,)) + self._all_eps = np.zeros((max_episodes,)) + self._all_time = np.zeros((max_episodes,)) + self._all_mean_reward_last_100_episodes = np.zeros((max_episodes,)) @property def logger(self): - return self.__logger + return self._logger @logger.setter def logger(self, logger): - self.__logger = logger + self._logger = logger @logger.deleter def logger(self): - self.__logger = None - - def add_avg_loss(self, episode, avg_loss): - self.__all_avg_loss[episode] = avg_loss + self._logger = None def add_total_reward(self, episode, total_reward): - self.__all_total_rewards[episode] = total_reward + self._all_total_rewards[episode] = total_reward def add_eps(self, episode, eps): - self.__all_eps[episode] = eps + self._all_eps[episode] = eps def add_time(self, episode, time): - self.__all_time[episode] = time + self._all_time[episode] = time def add_mean_reward_last_100(self, episode, mean_reward): - self.__all_mean_reward_last_100_episodes[episode] = mean_reward + self._all_mean_reward_last_100_episodes[episode] = mean_reward + + def log_episode(self, *args): + raise NotImplementedError + + def mean_of_reward(self, cur_episode, last=100): + if cur_episode > 0: + reward_last_100 =\ + self._all_total_rewards[max(0, cur_episode-last):cur_episode] + return np.mean(reward_last_100) + else: + return self._all_total_rewards[0] - def log_episode(self, episode, start_time, training_steps, loss, eps, reward): + def save(self, path): + np.save(os.path.join(path, 'total_rewards'), self._all_total_rewards) + np.save(os.path.join(path, 
'eps'), self._all_eps) + np.save(os.path.join(path, 'time'), self._all_time) + np.save( + os.path.join(path, 'mean_reward'), + self._all_mean_reward_last_100_episodes) + + def _log_episode(self, episode, start_time, training_steps, eps, reward): self.add_eps(episode, eps) self.add_total_reward(episode, reward) end = time.time() - if training_steps == 0: - avg_loss = 0 - else: - avg_loss = float(loss)/float(training_steps) - mean_reward_last_100 = self.mean_of_reward(episode, last=100) - time_elapsed = end - start_time - info = "Episode: %d, Total Reward: %.3f, Avg. Reward Last 100 Episodes: %.3f, Avg Loss: %.3f, Time: %.3f, Training Steps: %d, Eps: %.3f"\ - % (episode, reward, mean_reward_last_100, avg_loss, time_elapsed, training_steps, eps) - self.__logger.info(info) - self.add_avg_loss(episode, avg_loss) self.add_time(episode, time_elapsed) self.add_mean_reward_last_100(episode, mean_reward_last_100) + return ('Episode: %d, Total Reward: %.3f, ' + 'Avg. Reward Last 100 Episodes: %.3f, {}, ' + 'Time: %.3f, Training Steps: %d, Eps: %.3f') % ( + episode, reward, mean_reward_last_100, time_elapsed, + training_steps, eps), mean_reward_last_100 - return avg_loss, time_elapsed, mean_reward_last_100 - def mean_of_reward(self, cur_episode, last=100): - if cur_episode > 0: - reward_last_100 = self.__all_total_rewards[max(0, cur_episode-last):cur_episode] - return np.mean(reward_last_100) - else: - return self.__all_total_rewards[0] +class DqnTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DqnTrainingStats, self).__init__(max_episodes) + self._all_avg_loss = np.zeros((max_episodes,)) + + def add_avg_loss(self, episode, avg_loss): + self._all_avg_loss[episode] = avg_loss + + def log_episode( + self, episode, start_time, training_steps, avg_loss, eps, reward + ): + self.add_avg_loss(episode, avg_loss) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(('Avg. Loss: %.3f') % (avg_loss)) - def save_stats(self, file): - fig = plt.figure(figsize=(20,20)) + self._logger.info(info) + return avg_reward + + def save_stats(self, path): + fig = plt.figure(figsize=(20, 20)) sub_rewards = fig.add_subplot(221) sub_rewards.set_title('Total Rewards per episode') - sub_rewards.plot(np.arange(self.__max_episodes), self.__all_total_rewards) + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) sub_loss = fig.add_subplot(222) sub_loss.set_title('Avg. Loss per episode') - sub_loss.plot(np.arange(self.__max_episodes), self.__all_avg_loss) + sub_loss.plot(np.arange(self._max_episodes), self._all_avg_loss) sub_eps = fig.add_subplot(223) sub_eps.set_title('Epsilon per episode') - sub_eps.plot(np.arange(self.__max_episodes), self.__all_eps) + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) sub_rewards = fig.add_subplot(224) sub_rewards.set_title('Avg. 
mean reward of last 100 episodes') - sub_rewards.plot(np.arange(self.__max_episodes), self.__all_mean_reward_last_100_episodes) + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) + + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) + + def save(self, path): + super(DqnTrainingStats, self).save(path) + np.save(os.path.join(path, 'avg_loss'), self._all_avg_loss) + + +class DdpgTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DdpgTrainingStats, self).__init__(max_episodes) + self._all_avg_critic_loss = np.zeros((max_episodes,)) + self._all_avg_actor_loss = np.zeros((max_episodes,)) + self._all_avg_qvalues = np.zeros((max_episodes,)) + + def add_avg_critic_loss(self, episode, avg_critic_loss): + self._all_avg_critic_loss[episode] = avg_critic_loss + + def add_avg_actor_loss(self, episode, avg_actor_loss): + self._all_avg_actor_loss[episode] = avg_actor_loss + + def add_avg_qvalues(self, episode, avg_qvalues): + self._all_avg_qvalues[episode] = avg_qvalues + + def log_episode( + self, episode, start_time, training_steps, actor_loss, + critic_loss, qvalues, eps, reward + ): + self.add_avg_actor_loss(episode, actor_loss) + self.add_avg_critic_loss(episode, critic_loss) + self.add_avg_qvalues(episode, qvalues) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(( + 'Avg. Actor Loss: %.3f ' + 'Avg. Critic Loss: %.3f ' + 'Avg. Q-Values: %.3f') % (actor_loss, critic_loss, qvalues)) + + self.logger.info(info) + return avg_reward + + def save(self, path): + super(DdpgTrainingStats, self).save(path) + np.save(os.path.join( + path, 'avg_critic_loss'), self._all_avg_critic_loss) + np.save(os.path.join(path, 'avg_actor_loss'), self._all_avg_actor_loss) + np.save(os.path.join(path, 'avg_qvalues'), self._all_avg_qvalues) + + def save_stats(self, path): + fig = plt.figure(figsize=(120, 120)) + + sub_rewards = fig.add_subplot(321) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) + + sub_actor_loss = fig.add_subplot(322) + sub_actor_loss.set_title('Avg. Actor Loss per episode') + sub_actor_loss.plot( + np.arange(self._max_episodes), self._all_avg_actor_loss) + + sub_critic_loss = fig.add_subplot(323) + sub_critic_loss.set_title('Avg. Critic Loss per episode') + sub_critic_loss.plot( + np.arange(self._max_episodes), self._all_avg_critic_loss) + + sub_qvalues = fig.add_subplot(324) + sub_qvalues.set_title('Avg. QValues per episode') + sub_qvalues.plot( + np.arange(self._max_episodes), self._all_avg_qvalues) + + sub_eps = fig.add_subplot(325) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) + + sub_rewards = fig.add_subplot(326) + sub_rewards.set_title('Avg. 
mean reward of last 100 episodes') + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) - plt.savefig(file) \ No newline at end of file + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) diff --git a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GenerationTest.java b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GenerationTest.java index abe4575..56ff0c6 100644 --- a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GenerationTest.java +++ b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GenerationTest.java @@ -20,7 +20,12 @@ */ package de.monticore.lang.monticar.cnnarch.gluongenerator; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.RewardFunctionSourceGenerator; +import de.monticore.lang.monticar.cnnarch.gluongenerator.util.TrainedArchitectureMockFactory; +import de.monticore.lang.monticar.cnntrain.annotations.Range; +import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; import de.se_rwth.commons.logging.Finding; import de.se_rwth.commons.logging.Log; import freemarker.template.TemplateException; @@ -31,11 +36,13 @@ import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; +import java.util.List; import java.util.stream.Collector; import java.util.stream.Collectors; import static junit.framework.TestCase.assertTrue; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; public class GenerationTest extends AbstractSymtabTest { private RewardFunctionSourceGenerator rewardFunctionSourceGenerator; @@ -179,11 +186,13 @@ public class GenerationTest extends AbstractSymtabTest { @Test public void testReinforcementConfig2() { + // given Log.getFindings().clear(); Path modelPath = Paths.get("src/test/resources/valid_tests"); CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); + TrainedArchitecture trainedArchitecture = TrainedArchitectureMockFactory.createTrainedArchitectureMock(); - trainGenerator.generate(modelPath, "ReinforcementConfig2"); + trainGenerator.generate(modelPath, "ReinforcementConfig2", trainedArchitecture); assertTrue(Log.getFindings().isEmpty()); checkFilesAreEqual( @@ -193,11 +202,40 @@ public class GenerationTest extends AbstractSymtabTest { "CNNTrainer_reinforcementConfig2.py", "start_training.sh", "reinforcement_learning/__init__.py", - "reinforcement_learning/action_policy.py", + "reinforcement_learning/strategy.py", "reinforcement_learning/agent.py", "reinforcement_learning/environment.py", "reinforcement_learning/replay_memory.py", - "reinforcement_learning/util.py" + "reinforcement_learning/util.py", + "reinforcement_learning/cnnarch_logger.py" + ) + ); + } + + @Test + public void testReinforcementConfig3() { + // given + Log.getFindings().clear(); + Path modelPath = Paths.get("src/test/resources/valid_tests"); + CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); + TrainedArchitecture trainedArchitecture = TrainedArchitectureMockFactory.createTrainedArchitectureMock(); + + trainGenerator.generate(modelPath, "ReinforcementConfig3", trainedArchitecture); + + assertTrue(Log.getFindings().isEmpty()); + checkFilesAreEqual( + Paths.get("./target/generated-sources-cnnarch"), + Paths.get("./src/test/resources/target_code/ReinforcementConfig3"), + Arrays.asList( + 
"CNNTrainer_reinforcementConfig3.py", + "start_training.sh", + "reinforcement_learning/__init__.py", + "reinforcement_learning/strategy.py", + "reinforcement_learning/agent.py", + "reinforcement_learning/environment.py", + "reinforcement_learning/replay_memory.py", + "reinforcement_learning/util.py", + "reinforcement_learning/cnnarch_logger.py" ) ); } @@ -226,4 +264,61 @@ public class GenerationTest extends AbstractSymtabTest { "FindArmadillo.cmake")); } + @Test + public void testDdpgConfig() { + Log.getFindings().clear(); + Path modelPath = Paths.get("src/test/resources/valid_tests/ddpg"); + CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); + TrainedArchitecture trainedArchitecture = TrainedArchitectureMockFactory.createTrainedArchitectureMock(); + + trainGenerator.generate(modelPath, "ActorNetwork", trainedArchitecture); + + assertTrue(Log.getFindings().stream().noneMatch(Finding::isError)); + checkFilesAreEqual( + Paths.get("./target/generated-sources-cnnarch"), + Paths.get("./src/test/resources/target_code/ddpg"), + Arrays.asList( + "CNNTrainer_actorNetwork.py", + "start_training.sh", + "reinforcement_learning/CNNCreator_CriticNetwork.py", + "reinforcement_learning/CNNNet_CriticNetwork.py", + "reinforcement_learning/__init__.py", + "reinforcement_learning/strategy.py", + "reinforcement_learning/agent.py", + "reinforcement_learning/environment.py", + "reinforcement_learning/replay_memory.py", + "reinforcement_learning/util.py", + "reinforcement_learning/cnnarch_logger.py" + ) + ); + } + + @Test + public void testRosDdpgConfig() { + Log.getFindings().clear(); + Path modelPath = Paths.get("src/test/resources/valid_tests/ddpg-ros"); + CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); + TrainedArchitecture trainedArchitecture = TrainedArchitectureMockFactory.createTrainedArchitectureMock(); + + trainGenerator.generate(modelPath, "RosActorNetwork", trainedArchitecture); + + assertTrue(Log.getFindings().stream().noneMatch(Finding::isError)); + checkFilesAreEqual( + Paths.get("./target/generated-sources-cnnarch"), + Paths.get("./src/test/resources/target_code/ros-ddpg"), + Arrays.asList( + "CNNTrainer_rosActorNetwork.py", + "start_training.sh", + "reinforcement_learning/CNNCreator_RosCriticNetwork.py", + "reinforcement_learning/CNNNet_RosCriticNetwork.py", + "reinforcement_learning/strategy.py", + "reinforcement_learning/agent.py", + "reinforcement_learning/environment.py", + "reinforcement_learning/replay_memory.py", + "reinforcement_learning/util.py", + "reinforcement_learning/cnnarch_logger.py" + ) + ); + } + } diff --git a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/IntegrationPythonWrapperTest.java b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/IntegrationPythonWrapperTest.java index 5456e75..98b6855 100644 --- a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/IntegrationPythonWrapperTest.java +++ b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/IntegrationPythonWrapperTest.java @@ -1,6 +1,8 @@ package de.monticore.lang.monticar.cnnarch.gluongenerator; import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.RewardFunctionSourceGenerator; +import de.monticore.lang.monticar.cnnarch.gluongenerator.util.TrainedArchitectureMockFactory; +import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; import de.se_rwth.commons.logging.Finding; import de.se_rwth.commons.logging.Log; import org.junit.Before; @@ -31,7 +33,9 @@ public 
class IntegrationPythonWrapperTest extends AbstractSymtabTest{ Path modelPath = Paths.get("src/test/resources/valid_tests"); CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); - trainGenerator.generate(modelPath, "ReinforcementConfig1"); + TrainedArchitecture trainedArchitecture = TrainedArchitectureMockFactory.createTrainedArchitectureMock(); + + trainGenerator.generate(modelPath, "ReinforcementConfig1", trainedArchitecture); assertTrue(Log.getFindings().stream().filter(Finding::isError).collect(Collectors.toList()).isEmpty()); checkFilesAreEqual( @@ -41,12 +45,12 @@ public class IntegrationPythonWrapperTest extends AbstractSymtabTest{ "CNNTrainer_reinforcementConfig1.py", "start_training.sh", "reinforcement_learning/__init__.py", - "reinforcement_learning/action_policy.py", + "reinforcement_learning/strategy.py", "reinforcement_learning/agent.py", "reinforcement_learning/environment.py", "reinforcement_learning/replay_memory.py", - "reinforcement_learning/util.py" - ) + "reinforcement_learning/util.py", + "reinforcement_learning/cnnarch_logger.py") ); assertTrue(Paths.get("./target/generated-sources-cnnarch/reward/pylib").toFile().isDirectory()); } diff --git a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/util/TrainedArchitectureMockFactory.java b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/util/TrainedArchitectureMockFactory.java new file mode 100644 index 0000000..7470b9d --- /dev/null +++ b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/util/TrainedArchitectureMockFactory.java @@ -0,0 +1,48 @@ +package de.monticore.lang.monticar.cnnarch.gluongenerator.util; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import de.monticore.lang.monticar.cnntrain.annotations.Range; +import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; + +import java.util.List; +import java.util.Map; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class TrainedArchitectureMockFactory { + + public static TrainedArchitecture createTrainedArchitectureMock() { + TrainedArchitecture trainedArchitecture = mock(TrainedArchitecture.class); + + final String inputName = "state"; + final String outputName = "action"; + + Map<String, List<Integer>> dimensionMap = ImmutableMap.<String, List<Integer>> builder() + .put(inputName, Lists.newArrayList(8)) + .put(outputName, Lists.newArrayList(3)) + .build(); + + Range stateRange = Range.withInfinityLimits(); + Range actionRange = Range.withLimits(-1, 1); + Map<String, Range> rangeMap = ImmutableMap.<String, Range>builder() + .put(inputName, stateRange) + .put(outputName, actionRange) + .build(); + + Map<String, String> typeMap = ImmutableMap.<String, String>builder() + .put(inputName, "Q") + .put(outputName, "Q") + .build(); + + when(trainedArchitecture.getInputs()).thenReturn(Lists.newArrayList(inputName)); + when(trainedArchitecture.getOutputs()).thenReturn(Lists.newArrayList(outputName)); + when(trainedArchitecture.getDimensions()).thenReturn(dimensionMap); + when(trainedArchitecture.getRanges()).thenReturn(rangeMap); + when(trainedArchitecture.getTypes()).thenReturn(typeMap); + + return trainedArchitecture; + } + +} diff --git a/src/test/resources/target_code/CNNNet_Alexnet.py b/src/test/resources/target_code/CNNNet_Alexnet.py index 3a341d1..c60d7b8 100644 --- a/src/test/resources/target_code/CNNNet_Alexnet.py +++ b/src/test/resources/target_code/CNNNet_Alexnet.py @@ -74,9 +74,9 @@ class Net(gluon.HybridBlock): with self.name_scope(): if not data_mean is None: assert(not data_std is None)
- self.input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) + self.data_input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) else: - self.input_normalization = NoNormalization() + self.data_input_normalization = NoNormalization() self.conv1_padding = Padding(padding=(0,0,0,0,2,1,2,1)) self.conv1_ = gluon.nn.Conv2D(channels=96, @@ -197,8 +197,8 @@ class Net(gluon.HybridBlock): self.last_layer = 'softmax' - def hybrid_forward(self, F, x): - data = self.input_normalization(x) + def hybrid_forward(self, F, data): + data = self.data_input_normalization(data) conv1_padding = self.conv1_padding(data) conv1_ = self.conv1_(conv1_padding) lrn1_ = F.LRN(data=conv1_, diff --git a/src/test/resources/target_code/CNNNet_CifarClassifierNetwork.py b/src/test/resources/target_code/CNNNet_CifarClassifierNetwork.py index 6c77164..8afc814 100644 --- a/src/test/resources/target_code/CNNNet_CifarClassifierNetwork.py +++ b/src/test/resources/target_code/CNNNet_CifarClassifierNetwork.py @@ -74,9 +74,9 @@ class Net(gluon.HybridBlock): with self.name_scope(): if not data_mean is None: assert(not data_std is None) - self.input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) + self.data_input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) else: - self.input_normalization = NoNormalization() + self.data_input_normalization = NoNormalization() self.conv2_1_padding = Padding(padding=(0,0,0,0,1,1,1,1)) self.conv2_1_ = gluon.nn.Conv2D(channels=8, @@ -348,8 +348,8 @@ class Net(gluon.HybridBlock): self.last_layer = 'softmax' - def hybrid_forward(self, F, x): - data = self.input_normalization(x) + def hybrid_forward(self, F, data): + data = self.data_input_normalization(data) conv2_1_padding = self.conv2_1_padding(data) conv2_1_ = self.conv2_1_(conv2_1_padding) batchnorm2_1_ = self.batchnorm2_1_(conv2_1_) diff --git a/src/test/resources/target_code/CNNNet_VGG16.py b/src/test/resources/target_code/CNNNet_VGG16.py index 17367ab..0fd4c5f 100644 --- a/src/test/resources/target_code/CNNNet_VGG16.py +++ b/src/test/resources/target_code/CNNNet_VGG16.py @@ -74,9 +74,9 @@ class Net(gluon.HybridBlock): with self.name_scope(): if not data_mean is None: assert(not data_std is None) - self.input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) + self.data_input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) else: - self.input_normalization = NoNormalization() + self.data_input_normalization = NoNormalization() self.conv1_padding = Padding(padding=(0,0,0,0,1,1,1,1)) self.conv1_ = gluon.nn.Conv2D(channels=64, @@ -225,8 +225,8 @@ class Net(gluon.HybridBlock): self.last_layer = 'softmax' - def hybrid_forward(self, F, x): - data = self.input_normalization(x) + def hybrid_forward(self, F, data): + data = self.data_input_normalization(data) conv1_padding = self.conv1_padding(data) conv1_ = self.conv1_(conv1_padding) relu1_ = self.relu1_(conv1_) diff --git a/src/test/resources/target_code/ReinforcementConfig1/CNNTrainer_reinforcementConfig1.py b/src/test/resources/target_code/ReinforcementConfig1/CNNTrainer_reinforcementConfig1.py index 09c905c..f9095be 100644 --- a/src/test/resources/target_code/ReinforcementConfig1/CNNTrainer_reinforcementConfig1.py +++ b/src/test/resources/target_code/ReinforcementConfig1/CNNTrainer_reinforcementConfig1.py @@ -1,29 +1,27 @@ from reinforcement_learning.agent import DqnAgent from reinforcement_learning.util import AgentSignalHandler +from 
reinforcement_learning.cnnarch_logger import ArchLogger import reinforcement_learning.environment import CNNCreator_reinforcementConfig1 import os import sys import re -import logging +import time +import numpy as np import mxnet as mx -session_output_dir = 'session' -agent_name='reinforcement_agent' -session_param_output = os.path.join(session_output_dir, agent_name) -def resume_session(): - session_param_output = os.path.join(session_output_dir, agent_name) +def resume_session(sessions_dir): resume_session = False resume_directory = None - if os.path.isdir(session_output_dir) and os.path.isdir(session_param_output): + if os.path.isdir(sessions_dir): regex = re.compile(r'\d\d\d\d-\d\d-\d\d-\d\d-\d\d') - dir_content = os.listdir(session_param_output) + dir_content = os.listdir(sessions_dir) session_files = filter(regex.search, dir_content) session_files.sort(reverse=True) for d in session_files: - interrupted_session_dir = os.path.join(session_param_output, d, '.interrupted_session') + interrupted_session_dir = os.path.join(sessions_dir, d, '.interrupted_session') if os.path.isdir(interrupted_session_dir): resume = raw_input('Interrupted session from {} found. Do you want to resume? (y/n) '.format(d)) if resume == 'y': @@ -32,65 +30,82 @@ def resume_session(): break return resume_session, resume_directory + if __name__ == "__main__": + agent_name = 'reinforcement_agent' + # Prepare output directory and logger + all_output_dir = os.path.join('model', agent_name) + output_directory = os.path.join( + all_output_dir, + time.strftime('%Y-%m-%d-%H-%M-%S', + time.localtime(time.time()))) + ArchLogger.set_output_directory(output_directory) + ArchLogger.set_logger_name(agent_name) + ArchLogger.set_output_level(ArchLogger.INFO) + env_params = { - 'ros_node_name' : 'reinforcementConfig1TrainerNode', - 'state_topic' : '/environment/state', - 'action_topic' : '/environment/action', - 'reset_topic' : '/environment/reset', + 'ros_node_name': 'reinforcementConfig1TrainerNode', + 'state_topic': '/environment/state', + 'action_topic': '/environment/action', + 'reset_topic': '/environment/reset', } env = reinforcement_learning.environment.RosEnvironment(**env_params) - context = mx.cpu() - net_creator = CNNCreator_reinforcementConfig1.CNNCreator_reinforcementConfig1() - net_creator.construct(context) - replay_memory_params = { - 'method':'buffer', - 'memory_size':1000000, - 'sample_size':64, - 'state_dtype':'float32', - 'action_dtype':'uint8', - 'rewards_dtype':'float32' - } + context = mx.cpu() + qnet_creator = CNNCreator_reinforcementConfig1.CNNCreator_reinforcementConfig1() + qnet_creator.construct(context) - policy_params = { - 'method':'epsgreedy', - 'epsilon': 1, - 'min_epsilon': 0.02, - 'epsilon_decay_method': 'linear', - 'epsilon_decay': 0.0001, + agent_params = { + 'environment': env, + 'replay_memory_params': { + 'method': 'buffer', + 'memory_size': 1000000, + 'sample_size': 64, + 'state_dtype': 'float32', + 'action_dtype': 'float32', + 'rewards_dtype': 'float32' + }, + 'strategy_params': { + 'method':'epsgreedy', + 'epsilon': 1, + 'min_epsilon': 0.02, + 'epsilon_decay_method': 'linear', + 'epsilon_decay': 0.0001, + }, + 'agent_name': agent_name, + 'verbose': True, + 'output_directory': output_directory, + 'state_dim': (8,), + 'action_dim': (3,), + 'discount_factor': 0.99999, + 'training_episodes': 1000, + 'train_interval': 1, + 'snapshot_interval': 500, + 'max_episode_step': 10000, + 'target_score': 35000, + 'qnet':qnet_creator.net, + 'use_fix_target': True, + 'target_update_interval': 500, + 
'loss_function': 'huber_loss', + 'optimizer': 'adam', + 'optimizer_params': { + 'learning_rate': 0.001 }, + 'double_dqn': True, } - resume_session, resume_directory = resume_session() + resume, resume_directory = resume_session(all_output_dir) - if resume_session: - agent = DqnAgent.resume_from_session(resume_directory, net_creator.net, env) + if resume: + output_directory, _ = os.path.split(resume_directory) + ArchLogger.set_output_directory(output_directory) + resume_agent_params = { + 'session_dir': resume_directory, + 'environment': env, + 'net': qnet_creator.net, + } + agent = DqnAgent.resume_from_session(**resume_agent_params) else: - agent = DqnAgent( - network = net_creator.net, - environment=env, - replay_memory_params=replay_memory_params, - policy_params=policy_params, - state_dim=net_creator.get_input_shapes()[0], - discount_factor=0.99999, - loss_function='huber_loss', - optimizer='adam', - optimizer_params={ - 'learning_rate': 0.001 }, - training_episodes=1000, - train_interval=1, - use_fix_target=True, - target_update_interval=500, - double_dqn = True, - snapshot_interval=500, - agent_name=agent_name, - max_episode_step=10000, - output_directory=session_output_dir, - verbose=True, - live_plot = True, - make_logfile=True, - target_score=35000 - ) + agent = DqnAgent(**agent_params) signal_handler = AgentSignalHandler() signal_handler.register_agent(agent) @@ -98,4 +113,4 @@ if __name__ == "__main__": train_successful = agent.train() if train_successful: - agent.save_best_network(net_creator._model_dir_ + net_creator._model_prefix_ + '_newest', epoch=0) \ No newline at end of file + agent.save_best_network(qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_newest', epoch=0) diff --git a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/action_policy.py b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/action_policy.py deleted file mode 100644 index f43a211..0000000 --- a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/action_policy.py +++ /dev/null @@ -1,73 +0,0 @@ -import numpy as np - -class ActionPolicyBuilder(object): - def __init__(self): - pass - - def build_by_params(self, - method='epsgreedy', - epsilon=0.5, - min_epsilon=0.05, - epsilon_decay_method='no', - epsilon_decay=0.0, - action_dim=None): - - if epsilon_decay_method == 'linear': - decay = LinearDecay(eps_decay=epsilon_decay, min_eps=min_epsilon) - else: - decay = NoDecay() - - if method == 'epsgreedy': - assert action_dim is not None - assert len(action_dim) == 1 - return EpsilonGreedyActionPolicy(eps=epsilon, - number_of_actions=action_dim[0], decay=decay) - else: - assert action_dim is not None - assert len(action_dim) == 1 - return GreedyActionPolicy() - -class EpsilonGreedyActionPolicy(object): - def __init__(self, eps, number_of_actions, decay): - self.eps = eps - self.cur_eps = eps - self.__number_of_actions = number_of_actions - self.__decay_method = decay - - def select_action(self, values): - do_exploration = (np.random.rand() < self.cur_eps) - if do_exploration: - action = np.random.randint(low=0, high=self.__number_of_actions) - else: - action = values.asnumpy().argmax() - return action - - def decay(self): - self.cur_eps = self.__decay_method.decay(self.cur_eps) - - -class GreedyActionPolicy(object): - def __init__(self): - pass - - def select_action(self, values): - return values.asnumpy().argmax() - - def decay(self): - pass - - -class NoDecay(object): - def __init__(self): - pass - - def decay(self, cur_eps): - 
return cur_eps - -class LinearDecay(object): - def __init__(self, eps_decay, min_eps=0): - self.eps_decay = eps_decay - self.min_eps = min_eps - - def decay(self, cur_eps): - return max(cur_eps - self.eps_decay, self.min_eps) \ No newline at end of file diff --git a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py index acbc974..c06b32a 100644 --- a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py @@ -2,118 +2,382 @@ import mxnet as mx import numpy as np import time import os -import logging import sys import util import matplotlib.pyplot as plt +import pyprind +from cnnarch_logger import ArchLogger from replay_memory import ReplayMemoryBuilder -from action_policy import ActionPolicyBuilder -from util import copy_net, get_loss_function +from strategy import StrategyBuilder +from util import copy_net, get_loss_function,\ + copy_net_with_two_inputs, DdpgTrainingStats, DqnTrainingStats,\ + make_directory_if_not_exist from mxnet import nd, gluon, autograd -class DqnAgent(object): - def __init__(self, - network, + +class Agent(object): + def __init__( + self, environment, replay_memory_params, - policy_params, + strategy_params, state_dim, + action_dim, ctx=None, discount_factor=.9, - loss_function='euclidean', - optimizer='rmsprop', - optimizer_params = {'learning_rate':0.09}, training_episodes=50, train_interval=1, - use_fix_target=False, - double_dqn = False, - target_update_interval=10, + start_training=0, snapshot_interval=200, - agent_name='Dqn_agent', + agent_name='Agent', max_episode_step=99999, + evaluation_samples=1000, output_directory='model_parameters', verbose=True, - live_plot = True, - make_logfile=True, - target_score=None): - assert 0 < discount_factor <= 1 - assert train_interval > 0 - assert target_update_interval > 0 - assert snapshot_interval > 0 - assert max_episode_step > 0 - assert training_episodes > 0 - assert replay_memory_params is not None - assert type(state_dim) is tuple - - self.__ctx = mx.gpu() if ctx == 'gpu' else mx.cpu() - self.__qnet = network - - self.__environment = environment - self.__discount_factor = discount_factor - self.__training_episodes = training_episodes - self.__train_interval = train_interval - self.__verbose = verbose - self.__state_dim = state_dim - self.__action_dim = self.__qnet(nd.random_normal(shape=((1,) + self.__state_dim), ctx=self.__ctx)).shape[1:] + target_score=None + ): + assert 0 < discount_factor <= 1,\ + 'Discount factor must be between 0 and 1' + assert train_interval > 0, 'Train interval must be greater 0' + assert snapshot_interval > 0, 'Snapshot interval must be greater 0' + assert max_episode_step > 0,\ + 'Maximal steps per episode must be greater 0' + assert training_episodes > 0, 'Trainings episode must be greater 0' + assert replay_memory_params is not None,\ + 'Replay memory parameter not set' + assert type(state_dim) is tuple, 'State dimension is not a tuple' + assert type(action_dim) is tuple, 'Action dimension is not a tuple' + + self._logger = ArchLogger.get_logger() + self._ctx = mx.gpu() if ctx == 'gpu' else mx.cpu() + self._environment = environment + self._discount_factor = discount_factor + self._training_episodes = training_episodes + self._train_interval = train_interval + self._verbose = verbose + self._state_dim = state_dim replay_memory_params['state_dim'] = 
state_dim - self.__replay_memory_params = replay_memory_params + replay_memory_params['action_dim'] = action_dim + self._replay_memory_params = replay_memory_params rm_builder = ReplayMemoryBuilder() - self.__memory = rm_builder.build_by_params(**replay_memory_params) - self.__minibatch_size = self.__memory.sample_size - - policy_params['action_dim'] = self.__action_dim - self.__policy_params = policy_params - p_builder = ActionPolicyBuilder() - self.__policy = p_builder.build_by_params(**policy_params) - - self.__target_update_interval = target_update_interval - self.__target_qnet = copy_net(self.__qnet, self.__state_dim, ctx=self.__ctx) - self.__loss_function_str = loss_function - self.__loss_function = get_loss_function(loss_function) - self.__agent_name = agent_name - self.__snapshot_interval = snapshot_interval - self.__creation_time = time.time() - self.__max_episode_step = max_episode_step - self.__optimizer = optimizer - self.__optimizer_params = optimizer_params - self.__make_logfile = make_logfile - self.__double_dqn = double_dqn - self.__use_fix_target = use_fix_target - self.__live_plot = live_plot - self.__user_given_directory = output_directory - self.__target_score = target_score - - self.__interrupt_flag = False + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + self._action_dim = action_dim + + strategy_params['action_dim'] = self._action_dim + self._strategy_params = strategy_params + strategy_builder = StrategyBuilder() + self._strategy = strategy_builder.build_by_params(**strategy_params) + self._agent_name = agent_name + self._snapshot_interval = snapshot_interval + self._creation_time = time.time() + self._max_episode_step = max_episode_step + self._start_training = start_training + self._output_directory = output_directory + self._target_score = target_score + + self._evaluation_samples = evaluation_samples + self._best_avg_score = -np.infty + self._best_net = None + + self._interrupt_flag = False + self._training_stats = None # Training Context - self.__current_episode = 0 - self.__total_steps = 0 - - # Initialize best network - self.__best_net = copy_net(self.__qnet, self.__state_dim, self.__ctx) - self.__best_avg_score = None + self._current_episode = 0 + self._total_steps = 0 - # Gluon Trainer definition - self.__training_stats = None + @property + def current_episode(self): + return self._current_episode - # Prepare output directory and logger - self.__output_directory = output_directory\ - + '/' + self.__agent_name\ - + '/' + time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(self.__creation_time)) - self.__logger = self.__setup_logging() - self.__logger.info('Agent created with following parameters: {}'.format(self.__make_config_dict())) + @property + def environment(self): + return self._environment - @classmethod - def from_config_file(cls, network, environment, config_file_path, ctx=None): + def save_config_file(self): import json - # Load config - with open(config_file_path, 'r') as config_file: - config_dict = json.load(config_file) - return cls(network, environment, ctx=ctx, **config_dict) + make_directory_if_not_exist(self._output_directory) + filename = os.path.join(self._output_directory, 'config.json') + config = self._make_config_dict() + with open(filename, mode='w') as fp: + json.dump(config, fp, indent=4) + + def set_interrupt_flag(self, interrupt): + self._interrupt_flag = interrupt + + def _interrupt_training(self): + import pickle + self._logger.info('Training interrupted; 
Store state for resuming') + session_dir = self._get_session_dir() + agent_session_file = os.path.join(session_dir, 'agent.p') + logger = self._logger + + self._make_pickle_ready(session_dir) + + with open(agent_session_file, 'wb') as f: + pickle.dump(self, f, protocol=2) + logger.info('State successfully stored') + + def _make_pickle_ready(self, session_dir): + del self._training_stats.logger + self._logger = None + self._environment.close() + self._environment = None + self._save_net(self._best_net, 'best_net', session_dir) + self._best_net = None + + def _make_config_dict(self): + config = dict() + config['state_dim'] = self._state_dim + config['action_dim'] = self._action_dim + config['ctx'] = str(self._ctx) + config['discount_factor'] = self._discount_factor + config['strategy_params'] = self._strategy_params + config['replay_memory_params'] = self._replay_memory_params + config['training_episodes'] = self._training_episodes + config['start_training'] = self._start_training + config['evaluation_samples'] = self._evaluation_samples + config['train_interval'] = self._train_interval + config['snapshot_interval'] = self._snapshot_interval + config['agent_name'] = self._agent_name + config['max_episode_step'] = self._max_episode_step + config['output_directory'] = self._output_directory + config['verbose'] = self._verbose + config['target_score'] = self._target_score + return config + + def _adjust_optimizer_params(self, optimizer_params): + if 'weight_decay' in optimizer_params: + optimizer_params['wd'] = optimizer_params['weight_decay'] + del optimizer_params['weight_decay'] + if 'learning_rate_decay' in optimizer_params: + min_learning_rate = 1e-8 + if 'learning_rate_minimum' in optimizer_params: + min_learning_rate = optimizer_params['learning_rate_minimum'] + del optimizer_params['learning_rate_minimum'] + optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( + optimizer_params['step_size'], + factor=optimizer_params['learning_rate_decay'], + stop_factor_lr=min_learning_rate) + del optimizer_params['step_size'] + del optimizer_params['learning_rate_decay'] + + return optimizer_params + + def _sample_from_memory(self): + states, actions, rewards, next_states, terminals\ + = self._memory.sample(batch_size=self._minibatch_size) + states = nd.array(states, ctx=self._ctx) + actions = nd.array(actions, ctx=self._ctx) + rewards = nd.array(rewards, ctx=self._ctx) + next_states = nd.array(next_states, ctx=self._ctx) + terminals = nd.array(terminals, ctx=self._ctx) + return states, actions, rewards, next_states, terminals + + def evaluate(self, target=None, sample_games=100, verbose=True): + target = self._target_score if target is None else target + if target: + target_achieved = 0 + total_reward = 0 + + self._logger.info('Sampling from {} games...'.format(sample_games)) + for g in pyprind.prog_bar(range(sample_games)): + state = self._environment.reset() + step = 0 + game_reward = 0 + terminal = False + while not terminal and (step < self._max_episode_step): + action = self.get_next_action(state) + state, reward, terminal, _ = self._environment.step(action) + game_reward += reward + step += 1 + + if verbose: + info = 'Game %d: Reward %f' % (g, game_reward) + self._logger.debug(info) + if target: + if game_reward >= target: + target_achieved += 1 + total_reward += game_reward + + avg_reward = float(total_reward)/float(sample_games) + info = 'Avg. 
Reward: %f' % avg_reward + if target: + target_achieved_ratio = int( + (float(target_achieved)/float(sample_games))*100) + info += '; Target Achieved in %d%% of games'\ + % (target_achieved_ratio) + + if verbose: + self._logger.info(info) + return avg_reward + + def _do_snapshot_if_in_interval(self, episode): + do_snapshot =\ + (episode != 0 and (episode % self._snapshot_interval == 0)) + if do_snapshot: + self.save_parameters(episode=episode) + self._evaluate() + + def _evaluate(self, verbose=True): + avg_reward = self.evaluate( + sample_games=self._evaluation_samples, verbose=False) + info = 'Evaluation -> Average Reward in {} games: {}'.format( + self._evaluation_samples, avg_reward) + + if self._best_avg_score is None or self._best_avg_score <= avg_reward: + self._save_current_as_best_net() + self._best_avg_score = avg_reward + if verbose: + self._logger.info(info) + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _do_training(self): + return (self._total_steps % self._train_interval == 0) and\ + (self._memory.is_sample_possible(self._minibatch_size)) and\ + (self._current_episode >= self._start_training) + + def _check_interrupt_routine(self): + if self._interrupt_flag: + self._interrupt_flag = False + self._interrupt_training() + return True + return False + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + assert self._output_directory + assert isinstance(net, gluon.HybridBlock) + make_directory_if_not_exist(self._output_directory) + + if(episode is not None): + self._logger.info( + 'Saving model parameters after episode %d' % episode) + filename = filename + '-ep{}'.format(episode) + else: + self._logger.info('Saving model parameters') + self._save_net(net, filename) + + def _save_net(self, net, filename, filedir=None): + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename + '.params') + net.save_parameters(filename) + + def save_best_network(self, path, epoch=0): + self._logger.info( + 'Saving best network with average reward of {}'.format( + self._best_avg_score)) + self._best_net.export(path, epoch=epoch) + + def _get_session_dir(self): + session_dir = os.path.join( + self._output_directory, '.interrupted_session') + make_directory_if_not_exist(session_dir) + return session_dir + + def _save_current_as_best_net(self): + raise NotImplementedError + + def get_next_action(self, state): + raise NotImplementedError + + def save_parameters(self, episode): + raise NotImplementedError + + def train(self, episodes=None): + raise NotImplementedError + + +class DdpgAgent(Agent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + 
strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + assert critic is not None, 'Critic not set' + assert actor is not None, 'Actor is not set' + assert soft_target_update_rate > 0,\ + 'Target update must be greater zero' + assert actor_optimizer is not None, 'No actor optimizer set' + assert critic_optimizer is not None, 'No critic optimizer set' + + self._actor = actor + self._critic = critic + + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + self._actor_optimizer = actor_optimizer + self._actor_optimizer_params = self._adjust_optimizer_params( + actor_optimizer_params) + + self._critic_optimizer = critic_optimizer + self._critic_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + self._soft_target_update_rate = soft_target_update_rate + + self._logger.info( + 'Agent created with following parameters: {}'.format( + self._make_config_dict())) + + self._best_net = self._copy_actor() + + self._training_stats = DdpgTrainingStats(self._training_episodes) + + def _make_pickle_ready(self, session_dir): + super(DdpgAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._actor, 'actor', session_dir) + self._actor = None + self._save_net(self._critic, 'critic', session_dir) + self._critic = None + self._save_net(self._actor_target, 'actor_target', session_dir) + self._actor_target = None + self._save_net(self._critic_target, 'critic_target', session_dir) + self._critic_target = None @classmethod - def resume_from_session(cls, session_dir, net, environment): + def resume_from_session(cls, session_dir, actor, critic, environment): import pickle if not os.path.exists(session_dir): raise ValueError('Session directory does not exist') @@ -121,386 +385,516 @@ class DqnAgent(object): files = dict() files['agent'] = os.path.join(session_dir, 'agent.p') files['best_net_params'] = os.path.join(session_dir, 'best_net.params') - files['q_net_params'] = os.path.join(session_dir, 'qnet.params') - files['target_net_params'] = os.path.join(session_dir, 'target_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') for file in files.values(): if not os.path.exists(file): - raise ValueError('Session directory is not complete: {} is missing'.format(file)) + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) with open(files['agent'], 'rb') as f: agent = pickle.load(f) - agent.__environment = environment - agent.__qnet = net - agent.__qnet.load_parameters(files['q_net_params'], agent.__ctx) - agent.__qnet.hybridize() - agent.__qnet(nd.random_normal(shape=((1,) + agent.__state_dim), ctx=agent.__ctx)) - agent.__best_net = copy_net(agent.__qnet, agent.__state_dim, agent.__ctx) - agent.__best_net.load_parameters(files['best_net_params'], agent.__ctx) - agent.__target_qnet = copy_net(agent.__qnet, agent.__state_dim, agent.__ctx) - 
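# --- Illustrative sketch (editorial addition, not part of the generated code) ---
# Self-contained illustration of the pattern behind _make_pickle_ready/_interrupt_training
# and resume_from_session above: members that cannot be pickled (networks, logger,
# environment handle) are saved separately and set to None before the agent object is
# pickled, so a later resume can unpickle the agent and re-attach fresh networks.
import pickle


class _TinyAgent(object):
    def __init__(self):
        self.current_episode = 7      # plain training state survives pickling
        self.logger = lambda m: m     # stand-in for an unpicklable member

    def _make_pickle_ready(self):
        self.logger = None            # drop unpicklable parts; restored on resume


_agent = _TinyAgent()
_agent._make_pickle_ready()
_restored = pickle.loads(pickle.dumps(_agent, protocol=2))
assert _restored.current_episode == 7 and _restored.logger is None
# --- end of sketch ---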
agent.__target_qnet.load_parameters(files['target_net_params'], agent.__ctx) + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) - agent.__logger = agent.__setup_logging(append=True) - agent.__training_stats.logger = agent.__logger - agent.__logger.info('Agent was retrieved; Training can be continued') + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') return agent - def __interrupt_training(self): - import pickle - self.__logger.info('Training interrupted; Store state for resuming') - session_dir = os.path.join(self.__output_directory, '.interrupted_session') - if not os.path.exists(session_dir): - os.mkdir(session_dir) + def _save_current_as_best_net(self): + self._best_net = self._copy_actor() - del self.__training_stats.logger - logger = self.__logger - self.__logger = None - self.__environment.close() - self.__environment = None + def get_next_action(self, state): + action = self._actor(nd.array([state], ctx=self._ctx)) + return action[0].asnumpy() - self.__save_net(self.__qnet, 'qnet', session_dir) - self.__qnet = None - self.__save_net(self.__best_net, 'best_net', session_dir) - self.__best_net = None - self.__save_net(self.__target_qnet, 'target_net', session_dir) - self.__target_qnet = None + def save_parameters(self, episode): + self._save_parameters(self._actor, episode=episode) - agent_session_file = os.path.join(session_dir, 'agent.p') + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start DDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes - with open(agent_session_file, 'wb') as f: - pickle.dump(self, f) - self.__logger = logger - logger.info('State successfully stored') + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) - @property - def current_episode(self): - return self.__current_episode + # Initialize target Q' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() - @property - def environment(self): - return self.__environment + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() - def __adjust_optimizer_params(self, optimizer_params): - if 'weight_decay' in optimizer_params: - optimizer_params['wd'] = optimizer_params['weight_decay'] - del optimizer_params['weight_decay'] - 
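# --- Illustrative sketch (editorial addition, not part of the generated code) ---
# Numeric illustration of the critic target computed inside the training loop below:
#     y = r + (1 - terminal) * discount_factor * Q'(s', mu'(s'))
# The target-critic outputs Q'(s', mu'(s')) are stood in for by a fixed column vector.
import numpy as np

discount_factor = 0.9
rewards = np.array([[1.0], [0.0], [2.0]])
terminals = np.array([[0.0], [1.0], [0.0]])               # 1.0 marks a terminal transition
critic_target_qvalues = np.array([[10.0], [5.0], [3.0]])  # Q'(s', mu'(s')) per sample

y = rewards + (1.0 - terminals) * discount_factor * critic_target_qvalues
print(y)  # [[10.0], [0.0], [4.7]] - terminal transitions keep only the immediate reward
# --- end of sketch ---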
if 'learning_rate_decay' in optimizer_params: - min_learning_rate = 1e-8 - if 'learning_rate_minimum' in optimizer_params: - min_learning_rate = optimizer_params['learning_rate_minimum'] - del optimizer_params['learning_rate_minimum'] - optimizer_params['lr_scheduler'] = mx.lr.scheduler.FactorScheduler( - optimizer_params['step_size'], - factor=optimizer_params['learning_rate_decay'], - stop_factor_lr=min_learning_rate) - del optimizer_params['step_size'] - del optimizer_params['learning_rate_decay'] + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) - return optimizer_params + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False - def set_interrupt_flag(self, interrupt): - self.__interrupt_flag = interrupt + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + actor_target_actions = self._actor_target(next_states) + critic_target_qvalues = self._critic_target( + next_states, actor_target_actions) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) + y = rewards + (1.0 - terminals) * self._discount_factor\ + * critic_target_qvalues + + # Train the critic network + with autograd.record(): + qvalues = self._critic(states, actions) + critic_loss = l2_loss(qvalues, y) + critic_loss.backward() + trainer_critic.step(self._minibatch_size) + + # Train the actor network + # Temporary critic so that gluon trainer does not mess + # with critic parameters + tmp_critic = self._copy_critic() + with autograd.record(): + actor_qvalues = tmp_critic(states, self._actor(states)) + # For maximizing qvalues we have to multiply with -1 + # as we use a minimizer + actor_loss = -1 * actor_qvalues + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss +=\ + np.sum(actor_loss.asnumpy()) / self._minibatch_size + episode_avg_q_value 
+=\ + np.sum(actor_qvalues.asnumpy()) / self._minibatch_size - def __make_output_directory_if_not_exist(self): - assert self.__output_directory - if not os.path.exists(self.__output_directory): - os.makedirs(self.__output_directory) + training_steps += 1 - def __setup_logging(self, append=False): - assert self.__output_directory - assert self.__agent_name + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state - output_level = logging.DEBUG if self.__verbose else logging.WARNING - filemode = 'a' if append else 'w' + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if training_steps == 0\ + else (episode_actor_loss / training_steps) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if training_steps == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + self._strategy.decay(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DdpgAgent, self)._make_config_dict() + config['soft_target_update_rate'] = self._soft_target_update_rate + config['actor_optimizer'] = self._actor_optimizer + config['actor_optimizer_params'] = self._actor_optimizer_params + config['critic_optimizer'] = self._critic_optimizer + config['critic_optimizer_params'] = self._critic_optimizer_params + return config - logformat = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' - dateformat = '%d-%b-%y %H:%M:%S' - formatter = logging.Formatter(fmt=logformat, datefmt=dateformat) + def _soft_update(self, net, target, tau): + net_params = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(target.collect_params().items()): + target_params = p.data() + p.set_data((1.0 - tau) * target_params + tau * net_params[i]) + return target + + def _copy_actor(self): + assert self._actor is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + return copy_net(self._actor, self._state_dim, ctx=self._ctx) + + def _copy_critic(self): + assert self._critic is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic, self._state_dim, self._action_dim, ctx=self._ctx) + + +class DqnAgent(Agent): + def __init__( + self, + qnet, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + loss_function='euclidean', + optimizer='rmsprop', + optimizer_params={'learning_rate': 0.09}, + training_episodes=50, + start_training=0, + train_interval=1, + use_fix_target=False, + double_dqn=False, + target_update_interval=10, + snapshot_interval=200, + evaluation_samples=100, + agent_name='Dqn_agent', + max_episode_step=99999, + 
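# --- Illustrative sketch (editorial addition, not part of the generated code) ---
# The _soft_update defined above nudges every target parameter towards its online
# counterpart with rate tau (soft_target_update_rate):
#     p_target <- (1 - tau) * p_target + tau * p_online
# Shown here on plain numpy arrays instead of gluon parameters.
import numpy as np

tau = 0.001
online_param = np.array([1.0, 2.0, 3.0])
target_param = np.zeros(3)

for _ in range(1000):
    target_param = (1.0 - tau) * target_param + tau * online_param

print(target_param)  # roughly 63% of the way to the online weights after 1000 updates
# --- end of sketch ---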
output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DqnAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + + self._qnet = qnet + self._target_update_interval = target_update_interval + self._target_qnet = copy_net( + self._qnet, self._state_dim, ctx=self._ctx) + self._loss_function_str = loss_function + self._loss_function = get_loss_function(loss_function) + self._optimizer = optimizer + self._optimizer_params = optimizer_params + self._double_dqn = double_dqn + self._use_fix_target = use_fix_target - logger = logging.getLogger('DQNAgent') - logger.setLevel(output_level) + # Initialize best network + self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) + self._best_avg_score = -np.infty - stream_handler = logging.StreamHandler(sys.stdout) - stream_handler.setLevel(output_level) - stream_handler.setFormatter(formatter) - logger.addHandler(stream_handler) + self._training_stats = None - if self.__make_logfile: - self.__make_output_directory_if_not_exist() - log_file = os.path.join(self.__output_directory, self.__agent_name + '.log') - file_handler = logging.FileHandler(log_file, mode=filemode) - file_handler.setLevel(output_level) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) + @classmethod + def resume_from_session(cls, session_dir, net, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') - return logger + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['q_net_params'] = os.path.join(session_dir, 'qnet.params') + files['target_net_params'] = os.path.join( + session_dir, 'target_net.params') - def __is_target_reached(self, avg_reward): - return self.__target_score is not None\ - and avg_reward > self.__target_score + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + agent._qnet = net + agent._qnet.load_parameters(files['q_net_params'], agent._ctx) + agent._qnet.hybridize() + agent._qnet(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + agent._best_net = copy_net(agent._qnet, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + agent._target_qnet = copy_net( + agent._qnet, agent._state_dim, agent._ctx) + agent._target_qnet.load_parameters( + files['target_net_params'], agent._ctx) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = agent._logger + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _make_pickle_ready(self, session_dir): + super(DqnAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._qnet, 'qnet', session_dir) + self._qnet = None + self._save_net(self._target_qnet, 'target_net', session_dir) + 
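# --- Illustrative sketch (editorial addition, not part of the generated code) ---
# Toy example of the two targets the DqnAgent can build (see use_fix_target/double_dqn
# above and __determine_target_q_values below), for a single non-terminal transition.
import numpy as np

discount, reward = 0.9, 1.0
q_target_net = np.array([2.0, 5.0, 1.0])  # Q values of the (fixed) target network for s'
q_online_net = np.array([4.0, 3.0, 0.5])  # Q values of the online network for s'

# Standard DQN: the same network both selects and evaluates the next action.
standard_target = reward + discount * q_target_net.max()                 # 5.5
# Double DQN: the online network selects the action, the target network evaluates it.
double_target = reward + discount * q_target_net[q_online_net.argmax()]  # 2.8
print(standard_target, double_target)
# --- end of sketch ---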
self._target_qnet = None def get_q_values(self, state, with_best=False): - return self.get_batch_q_values(nd.array([state], ctx=self.__ctx), with_best=with_best)[0] + return self.get_batch_q_values( + nd.array([state], ctx=self._ctx), with_best=with_best)[0] def get_batch_q_values(self, state_batch, with_best=False): - return self.__best_net(state_batch) if with_best else self.__qnet(state_batch) + return self._best_net(state_batch)\ + if with_best else self._qnet(state_batch) def get_next_action(self, state, with_best=False): q_values = self.get_q_values(state, with_best=with_best) action = q_values.asnumpy().argmax() - return q_values.asnumpy().argmax() + return action - def __sample_from_memory(self): - states, actions, rewards, next_states, terminals\ - = self.__memory.sample(batch_size=self.__minibatch_size) - states = nd.array(states, ctx=self.__ctx) - actions = nd.array(actions, ctx=self.__ctx) - rewards = nd.array(rewards, ctx=self.__ctx) - next_states = nd.array(next_states, ctx=self.__ctx) - terminals = nd.array(terminals, ctx=self.__ctx) - return states, actions, rewards, next_states, terminals - - def __determine_target_q_values(self, states, actions, rewards, next_states, terminals): - if self.__use_fix_target: - q_max_val = self.__target_qnet(next_states) + def __determine_target_q_values( + self, states, actions, rewards, next_states, terminals + ): + if self._use_fix_target: + q_max_val = self._target_qnet(next_states) else: - q_max_val = self.__qnet(next_states) + q_max_val = self._qnet(next_states) - if self.__double_dqn: - q_values_next_states = self.__qnet(next_states) - target_rewards = rewards + nd.choose_element_0index(q_max_val, nd.argmax_channel(q_values_next_states))\ - * (1.0 - terminals) * self.__discount_factor + if self._double_dqn: + q_values_next_states = self._qnet(next_states) + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_values_next_states))\ + * (1.0 - terminals) * self._discount_factor else: - target_rewards = rewards + nd.choose_element_0index(q_max_val, nd.argmax_channel(q_max_val))\ - * (1.0 - terminals) * self.__discount_factor + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_max_val))\ + * (1.0 - terminals) * self._discount_factor - target_qval = self.__qnet(states) + target_qval = self._qnet(states) for t in range(target_rewards.shape[0]): target_qval[t][actions[t]] = target_rewards[t] return target_qval def __train_q_net_step(self, trainer): - states, actions, rewards, next_states, terminals = self.__sample_from_memory() - target_qval = self.__determine_target_q_values(states, actions, rewards, next_states, terminals) + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + target_qval = self.__determine_target_q_values( + states, actions, rewards, next_states, terminals) with autograd.record(): - q_values = self.__qnet(states) - loss = self.__loss_function(q_values, target_qval) + q_values = self._qnet(states) + loss = self._loss_function(q_values, target_qval) loss.backward() - trainer.step(self.__minibatch_size) + trainer.step(self._minibatch_size) return loss - def __do_snapshot_if_in_interval(self, episode): - do_snapshot = (episode != 0 and (episode % self.__snapshot_interval == 0)) - if do_snapshot: - self.save_parameters(episode=episode) - self.__evaluate() - def __do_target_update_if_in_interval(self, total_steps): - do_target_update = (self.__use_fix_target and total_steps % self.__target_update_interval == 0) + do_target_update 
= ( + self._use_fix_target and + (total_steps % self._target_update_interval == 0)) if do_target_update: - self.__logger.info('Target network is updated after {} steps'.format(total_steps)) - self.__target_qnet = copy_net(self.__qnet, self.__state_dim, self.__ctx) + self._logger.info( + 'Target network is updated after {} steps'.format(total_steps)) + self._target_qnet = copy_net( + self._qnet, self._state_dim, self._ctx) def train(self, episodes=None): - self.__logger.info("--- Start training ---") - trainer = gluon.Trainer(self.__qnet.collect_params(), self.__optimizer, self.__adjust_optimizer_params(self.__optimizer_params)) - episodes = episodes if episodes != None else self.__training_episodes - - resume = (self.__current_episode > 0) + self.save_config_file() + self._logger.info("--- Start training ---") + trainer = gluon.Trainer( + self._qnet.collect_params(), + self._optimizer, + self._adjust_optimizer_params(self._optimizer_params)) + episodes = episodes if episodes is not None\ + else self._training_episodes + + resume = (self._current_episode > 0) if resume: - self.__logger.info("Training session resumed") - self.__logger.info("Starting from episode {}".format(self.__current_episode)) + self._logger.info("Training session resumed") + self._logger.info("Starting from episode {}".format( + self._current_episode)) else: - self.__training_stats = util.TrainingStats(self.__logger, episodes, self.__live_plot) + self._training_stats = DqnTrainingStats(episodes) - # Implementation Deep Q Learning described by Mnih et. al. in Playing Atari with Deep Reinforcement Learning - while self.__current_episode < episodes: - # Check interrupt flag - if self.__interrupt_flag: - self.__interrupt_flag = False - self.__interrupt_training() + # Implementation Deep Q Learning described by + # Mnih et. al. in Playing Atari with Deep Reinforcement Learning + while self._current_episode < episodes: + if self._check_interrupt_routine(): return False step = 0 episode_reward = 0 start = time.time() - state = self.__environment.reset() + state = self._environment.reset() episode_loss = 0 training_steps = 0 - while step < self.__max_episode_step: - #1. Choose an action based on current game state and policy - q_values = self.__qnet(nd.array([state], ctx=self.__ctx)) - action = self.__policy.select_action(q_values[0]) + while step < self._max_episode_step: + # 1. Choose an action based on current game state and policy + q_values = self._qnet(nd.array([state], ctx=self._ctx)) + action = self._strategy.select_action(q_values[0]) - #2. Play the game for a single step - next_state, reward, terminal, _ = self.__environment.step(action) + # 2. Play the game for a single step + next_state, reward, terminal, _ =\ + self._environment.step(action) - #3. Store transition in replay memory - self.__memory.append(state, action, reward, next_state, terminal) + # 3. Store transition in replay memory + self._memory.append( + state, action, reward, next_state, terminal) - #4. Train the network if in interval - do_training = (self.__total_steps % self.__train_interval == 0\ - and self.__memory.is_sample_possible(self.__minibatch_size)) - if do_training: + # 4. 
Train the network if in interval + if self._do_training(): loss = self.__train_q_net_step(trainer) - loss_sum = sum(loss).asnumpy()[0] - episode_loss += float(loss_sum)/float(self.__minibatch_size) training_steps += 1 + episode_loss +=\ + np.sum(loss.asnumpy()) / self._minibatch_size # Update target network if in interval - self.__do_target_update_if_in_interval(self.__total_steps) + self.__do_target_update_if_in_interval(self._total_steps) step += 1 - self.__total_steps += 1 + self._total_steps += 1 episode_reward += reward state = next_state if terminal: - episode_loss = episode_loss if training_steps > 0 else None - _, _, avg_reward = self.__training_stats.log_episode(self.__current_episode, start, training_steps, - episode_loss, self.__policy.cur_eps, episode_reward) + self._strategy.reset() break - self.__do_snapshot_if_in_interval(self.__current_episode) - self.__policy.decay() - - if self.__is_target_reached(avg_reward): - self.__logger.info('Target score is reached in average; Training is stopped') - break - - self.__current_episode += 1 + self._do_snapshot_if_in_interval(self._current_episode) - self.__evaluate() - training_stats_file = os.path.join(self.__output_directory, 'training_stats.pdf') - self.__training_stats.save_stats(training_stats_file) - self.__logger.info('--------- Training finished ---------') - return True - - def __save_net(self, net, filename, filedir=None): - filedir = self.__output_directory if filedir is None else filedir - filename = os.path.join(filedir, filename + '.params') - net.save_parameters(filename) + episode_loss = (episode_loss / training_steps)\ + if training_steps > 0 else 0 + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_loss, self._strategy.cur_eps, episode_reward) + self._strategy.decay(self._current_episode) - def save_parameters(self, episode=None, filename='dqn-agent-params'): - assert self.__output_directory - self.__make_output_directory_if_not_exist() - - if(episode != None): - self.__logger.info('Saving model parameters after episode %d' % episode) - filename = filename + '-ep{}'.format(episode) - else: - self.__logger.info('Saving model parameters') - self.__save_net(self.__qnet, filename) - - def evaluate(self, target=None, sample_games=100, verbose=True): - target = self.__target_score if target is None else target - if target: - target_achieved = 0 - total_reward = 0 - - for g in range(sample_games): - state = self.__environment.reset() - step = 0 - game_reward = 0 - while step < self.__max_episode_step: - action = self.get_next_action(state) - state, reward, terminal, _ = self.__environment.step(action) - game_reward += reward - - if terminal: - if verbose: - info = 'Game %d: Reward %f' % (g,game_reward) - self.__logger.debug(info) - if target: - if game_reward >= target: - target_achieved += 1 - total_reward += game_reward - break - - step += 1 - - avg_reward = float(total_reward)/float(sample_games) - info = 'Avg. 
Reward: %f' % avg_reward - if target: - target_achieved_ratio = int((float(target_achieved)/float(sample_games))*100) - info += '; Target Achieved in %d%% of games' % (target_achieved_ratio) - - if verbose: - self.__logger.info(info) - return avg_reward - - def __evaluate(self, verbose=True): - sample_games = 100 - avg_reward = self.evaluate(sample_games=sample_games, verbose=False) - info = 'Evaluation -> Average Reward in {} games: {}'.format(sample_games, avg_reward) - - if self.__best_avg_score is None or self.__best_avg_score <= avg_reward: - self.__best_net = copy_net(self.__qnet, self.__state_dim, self.__ctx) - self.__best_avg_score = avg_reward - info += ' (NEW BEST)' - - if verbose: - self.__logger.info(info) - - - - def play(self, update_frame=1, with_best=False): - step = 0 - state = self.__environment.reset() - total_reward = 0 - while step < self.__max_episode_step: - action = self.get_next_action(state, with_best=with_best) - state, reward, terminal, _ = self.__environment.step(action) - total_reward += reward - do_update_frame = (step % update_frame == 0) - if do_update_frame: - self.__environment.render() - time.sleep(.100) - - if terminal: + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') break - step += 1 - return total_reward + self._current_episode += 1 - def save_best_network(self, path, epoch=0): - self.__logger.info('Saving best network with average reward of {}'.format(self.__best_avg_score)) - self.__best_net.export(path, epoch=epoch) + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True - def __make_config_dict(self): - config = dict() - config['discount_factor'] = self.__discount_factor - config['optimizer'] = self.__optimizer - config['optimizer_params'] = self.__optimizer_params - config['policy_params'] = self.__policy_params - config['replay_memory_params'] = self.__replay_memory_params - config['loss_function'] = self.__loss_function_str - config['optimizer'] = self.__optimizer - config['training_episodes'] = self.__training_episodes - config['train_interval'] = self.__train_interval - config['use_fix_target'] = self.__use_fix_target - config['double_dqn'] = self.__double_dqn - config['target_update_interval'] = self.__target_update_interval - config['snapshot_interval']= self.__snapshot_interval - config['agent_name'] = self.__agent_name - config['max_episode_step'] = self.__max_episode_step - config['output_directory'] = self.__user_given_directory - config['verbose'] = self.__verbose - config['live_plot'] = self.__live_plot - config['make_logfile'] = self.__make_logfile - config['target_score'] = self.__target_score + def _make_config_dict(self): + config = super(DqnAgent, self)._make_config_dict() + config['optimizer'] = self._optimizer + config['optimizer_params'] = self._optimizer_params + config['loss_function'] = self._loss_function_str + config['use_fix_target'] = self._use_fix_target + config['double_dqn'] = self._double_dqn + config['target_update_interval'] = self._target_update_interval return config - def save_config_file(self): - import json - self.__make_output_directory_if_not_exist() - filename = os.path.join(self.__output_directory, 'config.json') - config = self.__make_config_dict() - with open(filename, mode='w') as fp: - json.dump(config, fp, 
indent=4) \ No newline at end of file + def save_parameters(self, episode): + self._save_parameters(self._qnet, episode=episode) + + def _save_current_as_best_net(self): + self._best_net = copy_net( + self._qnet, (1,) + self._state_dim, ctx=self._ctx) diff --git a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/cnnarch_logger.py b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/cnnarch_logger.py new file mode 100644 index 0000000..19f36b1 --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/cnnarch_logger.py @@ -0,0 +1,95 @@ +import logging +import sys +import os +import util + + +class ArchLogger(object): + _logger = None + + __output_level = logging.INFO + __logger_name = 'agent' + __output_directory = '.' + __append = True + __logformat = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + __dateformat = '%d-%b-%y %H:%M:%S' + + INFO = logging.INFO + DEBUG = logging.DEBUG + + @staticmethod + def set_output_level(output_level): + assert output_level is not None + ArchLogger.__output_level = output_level + + @staticmethod + def set_logger_name(logger_name): + assert logger_name is not None + ArchLogger.__logger_name = logger_name + + @staticmethod + def set_output_directory(output_directory): + assert output_directory is not None + ArchLogger.__output_directory = output_directory + + @staticmethod + def set_append(append): + assert append is not None + ArchLogger.__append = append + + @staticmethod + def set_log_format(logformat, dateformat): + assert logformat is not None + assert dateformat is not None + ArchLogger.__logformat = logformat + ArchLogger.__dateformat = dateformat + + @staticmethod + def init_logger(make_log_file=True): + assert ArchLogger._logger is None, 'Logger init already called' + filemode = 'a' if ArchLogger.__append else 'w' + formatter = logging.Formatter( + fmt=ArchLogger.__logformat, datefmt=ArchLogger.__dateformat) + + logger = logging.getLogger(ArchLogger.__logger_name) + logger.propagate = False + + if not logger.handlers: + logger.setLevel(ArchLogger.__output_level) + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setLevel(ArchLogger.__output_level) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + if make_log_file: + util.make_directory_if_not_exist(ArchLogger.__output_directory) + log_file = os.path.join( + ArchLogger.__output_directory, + ArchLogger.__logger_name + '.log') + file_handler = logging.FileHandler(log_file, mode=filemode) + file_handler.setLevel(ArchLogger.__output_level) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + ArchLogger._logger = logger + + @staticmethod + def get_logger(): + if ArchLogger._logger is None: + ArchLogger.init_logger() + assert ArchLogger._logger is not None + return ArchLogger._logger + +if __name__ == "__main__": + print('=== Test logger ===') + ArchLogger.set_logger_name('TestLogger') + ArchLogger.set_output_directory('test_log') + ArchLogger.init_logger() + logger = ArchLogger.get_logger() + logger.warning('This is a warning') + logger.debug('This is a debug information, which you should not see') + logger.info('This is a normal information') + assert os.path.exists('test_log')\ + and os.path.isfile(os.path.join('test_log', 'TestLogger.log')),\ + 'Test failed: No logfile exists' + import shutil + shutil.rmtree('test_log') \ No newline at end of file diff --git 
a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/environment.py b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/environment.py index 405f1b0..cd86ea4 100644 --- a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/environment.py +++ b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/environment.py @@ -10,9 +10,11 @@ class RewardFunction(object): self.__reward_wrapper.init() def reward(self, state, terminal): + s = state.astype('double') + t = bool(terminal) inp = reward_rewardFunction_executor.reward_rewardFunction_input() - inp.state = state - inp.isTerminal = terminal + inp.state = s + inp.isTerminal = t output = self.__reward_wrapper.execute(inp) return output.reward @@ -40,7 +42,7 @@ import rospy import thread import numpy as np import time -from std_msgs.msg import Float32MultiArray, Bool, Int32 +from std_msgs.msg import Float32MultiArray, Bool, Int32, MultiArrayDimension, Float32 class RosEnvironment(Environment): def __init__(self, @@ -50,15 +52,13 @@ class RosEnvironment(Environment): action_topic='action', reset_topic='reset', terminal_state_topic='terminal', - meta_topic='meta', - greeting_topic='greeting'): + reward_topic='reward'): super(RosEnvironment, self).__init__() self.__timeout_in_s = timeout_in_s - self.__waiting_for_state_update = False self.__waiting_for_terminal_update = False self.__last_received_state = 0 - self.__last_received_terminal = 0 + self.__last_received_terminal = True rospy.loginfo("Initialize node {0}".format(ros_node_name)) @@ -111,7 +111,8 @@ class RosEnvironment(Environment): def __wait_for_new_state(self, publisher, msg): time_of_timeout = time.time() + self.__timeout_in_s timeout_counter = 0 - while(self.__waiting_for_state_update or self.__waiting_for_terminal_update): + while(self.__waiting_for_state_update + or self.__waiting_for_terminal_update): is_timeout = (time.time() > time_of_timeout) if (is_timeout): if timeout_counter < 3: @@ -127,9 +128,8 @@ class RosEnvironment(Environment): def close(self): rospy.signal_shutdown('Program ended!') - def __state_callback(self, data): - self.__last_received_state = np.array(data.data, dtype='double') + self.__last_received_state = np.array(data.data, dtype='float32') rospy.logdebug('Received state: {}'.format(self.__last_received_state)) self.__waiting_for_state_update = False diff --git a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/replay_memory.py b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/replay_memory.py index e66cd93..d22147a 100644 --- a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/replay_memory.py +++ b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/replay_memory.py @@ -1,58 +1,90 @@ import numpy as np + class ReplayMemoryBuilder(object): def __init__(self): self.__supported_methods = ['online', 'buffer', 'combined'] - def build_by_params(self, + def build_by_params( + self, state_dim, method='online', state_dtype='float32', + action_dim=(1,), action_dtype='uint8', rewards_dtype='float32', memory_size=1000, - sample_size=32): + sample_size=32 + ): assert state_dim is not None + assert action_dim is not None assert method in self.__supported_methods if method == 'online': - return self.build_online_memory(state_dim=state_dim, state_dtype=state_dtype, - action_dtype=action_dtype, rewards_dtype=rewards_dtype) + return self.build_online_memory( + state_dim=state_dim, 
state_dtype=state_dtype, + action_dtype=action_dtype, action_dim=action_dim, + rewards_dtype=rewards_dtype) else: assert memory_size is not None and memory_size > 0 assert sample_size is not None and sample_size > 0 if method == 'buffer': - return self.build_buffered_memory(state_dim=state_dim, sample_size=sample_size, - memory_size=memory_size, state_dtype=state_dtype, action_dtype=action_dtype, + return self.build_buffered_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, rewards_dtype=rewards_dtype) else: - return self.build_combined_memory(state_dim=state_dim, sample_size=sample_size, - memory_size=memory_size, state_dtype=state_dtype, action_dtype=action_dtype, + return self.build_combined_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, rewards_dtype=rewards_dtype) - def build_buffered_memory(self, state_dim, memory_size=1000, sample_size=1, state_dtype='float32', - action_dtype='uint8', rewards_dtype='float32'): + def build_buffered_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): assert memory_size > 0 assert sample_size > 0 - return ReplayMemory(state_dim, size=memory_size, sample_size=sample_size, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) - - def build_combined_memory(self, state_dim, memory_size=1000, sample_size=1, state_dtype='float32', - action_dtype='uint8', rewards_dtype='float32'): + return ReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_combined_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): assert memory_size > 0 assert sample_size > 0 - return CombinedReplayMemory(state_dim, size=memory_size, sample_size=sample_size, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) + return CombinedReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_online_memory( + self, state_dim, state_dtype, action_dtype, action_dim, rewards_dtype + ): + return OnlineReplayMemory( + state_dim, state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) - def build_online_memory(self, state_dim, state_dtype='float32', action_dtype='uint8', - rewards_dtype='float32'): - return OnlineReplayMemory(state_dim, state_dtype=state_dtype, action_dtype=action_dtype, - rewards_dtype=rewards_dtype) class ReplayMemory(object): - def __init__(self, state_dim, sample_size, size=1000, state_dtype='uint8', action_dtype='uint8', rewards_dtype='float32'): + def __init__( + self, + state_dim, + sample_size, + size=1000, + action_dim=(1,), + state_dtype='float32', + action_dtype='uint8', + rewards_dtype='float32' + ): assert size > 0, "Size must be greater than zero" assert type(state_dim) is tuple, "State dimension must be a tuple" + assert type(action_dim) is tuple, "Action dimension must be a tuple" assert sample_size > 0 self._size = size self._sample_size = sample_size @@ -60,12 +92,15 @@ class ReplayMemory(object): self._pointer = 0 self._state_dim = state_dim 
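# --- Illustrative sketch (editorial addition, not part of the generated code) ---
# The ReplayMemory here preallocates fixed-size numpy arrays and tracks a write pointer
# plus the current fill level. A minimal stand-alone version of that storage scheme;
# the wrap-around used below is one typical way to overwrite the oldest entries and is
# only an assumption about the unshown append logic.
import numpy as np


class MiniBuffer(object):
    def __init__(self, size, state_dim):
        self.size, self.cur_size, self.pointer = size, 0, 0
        self.states = np.zeros((size,) + state_dim, dtype='float32')

    def append(self, state):
        self.states[self.pointer] = state
        self.pointer = (self.pointer + 1) % self.size
        self.cur_size = min(self.size, self.cur_size + 1)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.cur_size, size=batch_size)
        return self.states[idx]


buf = MiniBuffer(size=4, state_dim=(2,))
for step in range(6):                      # six appends overwrite the two oldest entries
    buf.append(np.array([step, step]))
print(buf.sample(batch_size=2))
# --- end of sketch ---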
self._state_dtype = state_dtype + self._action_dim = action_dim self._action_dtype = action_dtype self._rewards_dtype = rewards_dtype self._states = np.zeros((self._size,) + state_dim, dtype=state_dtype) - self._actions = np.array([0] * self._size, dtype=action_dtype) + self._actions = np.zeros( + (self._size,) + action_dim, dtype=action_dtype) self._rewards = np.array([0] * self._size, dtype=rewards_dtype) - self._next_states = np.zeros((self._size,) + state_dim, dtype=state_dtype) + self._next_states = np.zeros( + (self._size,) + state_dim, dtype=state_dtype) self._terminals = np.array([0] * self._size, dtype='bool') @property @@ -93,17 +128,23 @@ class ReplayMemory(object): self._terminals[index] def is_sample_possible(self, batch_size=None): - batch_size = batch_size if batch_size is not None else self._sample_size + batch_size = batch_size if batch_size is not None\ + else self._sample_size return self._cur_size >= batch_size def sample(self, batch_size=None): - batch_size = batch_size if batch_size is not None else self._sample_size - assert self._cur_size >= batch_size, "Size of replay memory must be larger than batch size" - i=0 - states = np.zeros((batch_size,)+self._state_dim, dtype=self._state_dtype) - actions = np.zeros(batch_size, dtype=self._action_dtype) + batch_size = batch_size if batch_size is not None\ + else self._sample_size + assert self._cur_size >= batch_size,\ + "Size of replay memory must be larger than batch size" + i = 0 + states = np.zeros(( + batch_size,)+self._state_dim, dtype=self._state_dtype) + actions = np.zeros( + (batch_size,)+self._action_dim, dtype=self._action_dtype) rewards = np.zeros(batch_size, dtype=self._rewards_dtype) - next_states = np.zeros((batch_size,)+self._state_dim, dtype=self._state_dtype) + next_states = np.zeros( + (batch_size,)+self._state_dim, dtype=self._state_dtype) terminals = np.zeros(batch_size, dtype='bool') while i < batch_size: @@ -119,25 +160,35 @@ class ReplayMemory(object): class OnlineReplayMemory(ReplayMemory): - def __init__(self, state_dim, state_dtype='float32', action_dtype='uint8', rewards_dtype='float32'): - super(OnlineReplayMemory, self).__init__(state_dim, sample_size=1, size=1, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) + def __init__( + self, state_dim, state_dtype='float32', action_dim=(1,), + action_dtype='uint8', rewards_dtype='float32' + ): + super(OnlineReplayMemory, self).__init__( + state_dim, sample_size=1, size=1, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) class CombinedReplayMemory(ReplayMemory): - def __init__(self, state_dim, sample_size, size=1000, - state_dtype='uint8', action_dtype='uint8', rewards_dtype='float32'): - super(CombinedReplayMemory, self).__init__(state_dim, sample_size=(sample_size - 1), size=size, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) + def __init__( + self, state_dim, sample_size, size=1000, state_dtype='float32', + action_dim=(1,), action_dtype='uint8', rewards_dtype='float32' + ): + super(CombinedReplayMemory, self).__init__( + state_dim=state_dim, sample_size=(sample_size - 1), size=size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) self._last_state = np.zeros((1,) + state_dim, dtype=state_dtype) - self._last_action = np.array([0], dtype=action_dtype) + self._last_action = np.array((1,) + action_dim, dtype=action_dtype) self._last_reward = np.array([0], 
dtype=rewards_dtype) self._last_next_state = np.zeros((1,) + state_dim, dtype=state_dtype) self._last_terminal = np.array([0], dtype='bool') def append(self, state, action, reward, next_state, terminal): - super(CombinedReplayMemory, self).append(state, action, reward, next_state, terminal) + super(CombinedReplayMemory, self).append( + state, action, reward, next_state, terminal) self._last_state = state self._last_action = action self._last_reward = reward @@ -145,8 +196,10 @@ class CombinedReplayMemory(ReplayMemory): self._last_terminal = terminal def sample(self, batch_size=None): - batch_size = (batch_size-1) if batch_size is not None else self._sample_size - states, actions, rewards, next_states, terminals = super(CombinedReplayMemory, self).sample(batch_size=batch_size) + batch_size = (batch_size-1) if batch_size is not None\ + else self._sample_size + states, actions, rewards, next_states, terminals = super( + CombinedReplayMemory, self).sample(batch_size=batch_size) states = np.append(states, [self._last_state], axis=0) actions = np.append(actions, [self._last_action], axis=0) rewards = np.append(rewards, [self._last_reward], axis=0) diff --git a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/strategy.py b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/strategy.py new file mode 100644 index 0000000..4c47adc --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/strategy.py @@ -0,0 +1,172 @@ +import numpy as np + + +class StrategyBuilder(object): + def __init__(self): + pass + + def build_by_params( + self, + method='epsgreedy', + epsilon=0.5, + min_epsilon=0.05, + epsilon_decay_method='no', + epsilon_decay=0.0, + epsilon_decay_start=0, + action_dim=None, + action_low=None, + action_high=None, + mu=0.0, + theta=0.5, + sigma=0.3 + ): + + if epsilon_decay_method == 'linear': + decay = LinearDecay( + eps_decay=epsilon_decay, min_eps=min_epsilon, + decay_start=epsilon_decay_start) + else: + decay = NoDecay() + + if method == 'epsgreedy': + assert action_dim is not None + assert len(action_dim) == 1 + return EpsilonGreedyStrategy( + eps=epsilon, number_of_actions=action_dim[0], + decay_method=decay) + elif method == 'ornstein_uhlenbeck': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert mu is not None + assert theta is not None + assert sigma is not None + return OrnsteinUhlenbeckStrategy( + action_dim, action_low, action_high, epsilon, mu, theta, + sigma, decay) + else: + assert action_dim is not None + assert len(action_dim) == 1 + return GreedyStrategy() + + +class BaseDecay(object): + def __init__(self): + pass + + def decay(self, *args): + raise NotImplementedError + + def __call__(self, *args): + return self.decay(*args) + + +class NoDecay(BaseDecay): + def __init__(self): + super(NoDecay, self).__init__() + + def decay(self, cur_eps, episode): + return cur_eps + + +class LinearDecay(BaseDecay): + def __init__(self, eps_decay, min_eps=0, decay_start=0): + super(LinearDecay, self).__init__() + self.eps_decay = eps_decay + self.min_eps = min_eps + self.decay_start = decay_start + + def decay(self, cur_eps, episode): + if episode < self.decay_start: + return cur_eps + else: + return max(cur_eps - self.eps_decay, self.min_eps) + + +class BaseStrategy(object): + def __init__(self, decay_method): + self._decay_method = decay_method + + def select_action(self, values, decay_method): + raise NotImplementedError + + def decay(self, episode): 
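# --- Illustrative sketch (editorial addition, not part of the generated code) ---
# Worked example of the LinearDecay schedule defined above: epsilon is left untouched
# until decay_start and is then reduced by eps_decay per episode, never below min_eps.
eps, eps_decay, min_eps, decay_start = 1.0, 0.1, 0.05, 2

history = []
for episode in range(12):
    history.append(round(eps, 2))
    if episode >= decay_start:                      # LinearDecay.decay(cur_eps, episode)
        eps = max(eps - eps_decay, min_eps)

print(history)  # [1.0, 1.0, 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
# --- end of sketch ---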
+ self.cur_eps = self._decay_method.decay(self.cur_eps, episode) + + def reset(self): + pass + + +class EpsilonGreedyStrategy(BaseStrategy): + def __init__(self, eps, number_of_actions, decay_method): + super(EpsilonGreedyStrategy, self).__init__(decay_method) + self.eps = eps + self.cur_eps = eps + self.__number_of_actions = number_of_actions + + def select_action(self, values): + do_exploration = (np.random.rand() < self.cur_eps) + if do_exploration: + action = np.random.randint(low=0, high=self.__number_of_actions) + else: + action = values.asnumpy().argmax() + return action + + +class GreedyStrategy(BaseStrategy): + def __init__(self): + super(GreedyStrategy, self).__init__(None) + + def select_action(self, values): + return values.asnumpy().argmax() + + def decay(self): + pass + + +class OrnsteinUhlenbeckStrategy(BaseStrategy): + """ + Ornstein-Uhlenbeck process: dxt = theta * (mu - xt) * dt + sigma * dWt + where Wt denotes the Wiener process. + """ + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + mu=0.0, + theta=.15, + sigma=.3, + decay=NoDecay() + ): + super(OrnsteinUhlenbeckStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._decay_method = decay + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._mu = np.array(mu) + self._theta = np.array(theta) + self._sigma = np.array(sigma) + + self.state = np.ones(self._action_dim) * self._mu + + def _evolve_state(self): + x = self.state + dx = self._theta * (self._mu - x)\ + + self._sigma * np.random.randn(len(x)) + self.state = x + dx + return self.state + + def reset(self): + self.state = np.ones(self._action_dim) * self._mu + + def select_action(self, values): + noise = self._evolve_state() + action = values + (self.cur_eps * noise) + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/util.py b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/util.py index 5857893..a1da1ce 100644 --- a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/util.py +++ b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/util.py @@ -5,9 +5,9 @@ import matplotlib.pyplot as plt from matplotlib import style import time import os -import mxnet +import mxnet as mx from mxnet import gluon, nd - +import cnnarch_logger LOSS_FUNCTIONS = { 'l1': gluon.loss.L1Loss(), @@ -16,17 +16,46 @@ LOSS_FUNCTIONS = { 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} -def copy_net(net, input_state_dim, ctx, tmp_filename='tmp.params'): + +def make_directory_if_not_exist(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + +def copy_net(net, input_state_dim, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2(mx.nd.ones((1,) + input_state_dim, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + + return net2 + + +def copy_net_with_two_inputs(net, input_state_dim1, input_state_dim2, ctx): assert isinstance(net, gluon.HybridBlock) assert type(net.__class__) is type - net.save_parameters(tmp_filename) + net2 = net.__class__() - 
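# --- Illustrative sketch (editorial addition, not part of the generated code) ---
# Stand-alone simulation of the exploration noise produced by the OrnsteinUhlenbeckStrategy
# defined above: x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1). The temporally
# correlated noise is scaled by the current epsilon and added to the actor output,
# then clipped to the action bounds.
import numpy as np

np.random.seed(0)
mu, theta, sigma, eps = 0.0, 0.15, 0.3, 1.0
action_low, action_high = -1.0, 1.0
noise_state = np.ones(2) * mu                      # one noise process per action dimension
actor_output = np.array([0.2, -0.4])

for _ in range(3):
    noise_state = noise_state + theta * (mu - noise_state) \
        + sigma * np.random.randn(len(noise_state))
    noisy_action = np.clip(actor_output + eps * noise_state, action_low, action_high)
    print(noisy_action)
# --- end of sketch ---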
net2.load_parameters(tmp_filename, ctx=ctx) - os.remove(tmp_filename) + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) net2.hybridize() - net2(nd.ones((1,) + input_state_dim, ctx=ctx)) + net2( + nd.ones((1,) + input_state_dim1, ctx=ctx), + nd.ones((1,) + input_state_dim2, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + return net2 + def get_loss_function(loss_function_name): if loss_function_name not in LOSS_FUNCTIONS: raise ValueError('Loss function does not exist') @@ -52,89 +81,196 @@ class AgentSignalHandler(object): sys.exit(1) style.use('fivethirtyeight') + + class TrainingStats(object): - def __init__(self, logger, max_episodes, live_plot=True): - self.__logger = logger - self.__max_episodes = max_episodes - self.__all_avg_loss = np.zeros((max_episodes,)) - self.__all_total_rewards = np.zeros((max_episodes,)) - self.__all_eps = np.zeros((max_episodes,)) - self.__all_time = np.zeros((max_episodes,)) - self.__all_mean_reward_last_100_episodes = np.zeros((max_episodes,)) - self.__live_plot = live_plot + def __init__(self, max_episodes): + self._logger = cnnarch_logger.ArchLogger.get_logger() + self._max_episodes = max_episodes + self._all_total_rewards = np.zeros((max_episodes,)) + self._all_eps = np.zeros((max_episodes,)) + self._all_time = np.zeros((max_episodes,)) + self._all_mean_reward_last_100_episodes = np.zeros((max_episodes,)) @property def logger(self): - return self.__logger + return self._logger @logger.setter def logger(self, logger): - self.__logger = logger + self._logger = logger @logger.deleter def logger(self): - self.__logger = None - - def add_avg_loss(self, episode, avg_loss): - self.__all_avg_loss[episode] = avg_loss + self._logger = None def add_total_reward(self, episode, total_reward): - self.__all_total_rewards[episode] = total_reward + self._all_total_rewards[episode] = total_reward def add_eps(self, episode, eps): - self.__all_eps[episode] = eps + self._all_eps[episode] = eps def add_time(self, episode, time): - self.__all_time[episode] = time + self._all_time[episode] = time def add_mean_reward_last_100(self, episode, mean_reward): - self.__all_mean_reward_last_100_episodes[episode] = mean_reward + self._all_mean_reward_last_100_episodes[episode] = mean_reward + + def log_episode(self, *args): + raise NotImplementedError + + def mean_of_reward(self, cur_episode, last=100): + if cur_episode > 0: + reward_last_100 =\ + self._all_total_rewards[max(0, cur_episode-last):cur_episode] + return np.mean(reward_last_100) + else: + return self._all_total_rewards[0] - def log_episode(self, episode, start_time, training_steps, loss, eps, reward): + def save(self, path): + np.save(os.path.join(path, 'total_rewards'), self._all_total_rewards) + np.save(os.path.join(path, 'eps'), self._all_eps) + np.save(os.path.join(path, 'time'), self._all_time) + np.save( + os.path.join(path, 'mean_reward'), + self._all_mean_reward_last_100_episodes) + + def _log_episode(self, episode, start_time, training_steps, eps, reward): self.add_eps(episode, eps) self.add_total_reward(episode, reward) end = time.time() - if training_steps == 0: - avg_loss = 0 - else: - avg_loss = float(loss)/float(training_steps) - mean_reward_last_100 = self.mean_of_reward(episode, last=100) - time_elapsed = end - start_time - info = "Episode: %d, Total Reward: %.3f, Avg. 
Reward Last 100 Episodes: %.3f, Avg Loss: %.3f, Time: %.3f, Training Steps: %d, Eps: %.3f"\ - % (episode, reward, mean_reward_last_100, avg_loss, time_elapsed, training_steps, eps) - self.__logger.info(info) - self.add_avg_loss(episode, avg_loss) self.add_time(episode, time_elapsed) self.add_mean_reward_last_100(episode, mean_reward_last_100) + return ('Episode: %d, Total Reward: %.3f, ' + 'Avg. Reward Last 100 Episodes: %.3f, {}, ' + 'Time: %.3f, Training Steps: %d, Eps: %.3f') % ( + episode, reward, mean_reward_last_100, time_elapsed, + training_steps, eps), mean_reward_last_100 - return avg_loss, time_elapsed, mean_reward_last_100 - def mean_of_reward(self, cur_episode, last=100): - if cur_episode > 0: - reward_last_100 = self.__all_total_rewards[max(0, cur_episode-last):cur_episode] - return np.mean(reward_last_100) - else: - return self.__all_total_rewards[0] +class DqnTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DqnTrainingStats, self).__init__(max_episodes) + self._all_avg_loss = np.zeros((max_episodes,)) + + def add_avg_loss(self, episode, avg_loss): + self._all_avg_loss[episode] = avg_loss + + def log_episode( + self, episode, start_time, training_steps, avg_loss, eps, reward + ): + self.add_avg_loss(episode, avg_loss) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(('Avg. Loss: %.3f') % (avg_loss)) - def save_stats(self, file): - fig = plt.figure(figsize=(20,20)) + self._logger.info(info) + return avg_reward + + def save_stats(self, path): + fig = plt.figure(figsize=(20, 20)) sub_rewards = fig.add_subplot(221) sub_rewards.set_title('Total Rewards per episode') - sub_rewards.plot(np.arange(self.__max_episodes), self.__all_total_rewards) + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) sub_loss = fig.add_subplot(222) sub_loss.set_title('Avg. Loss per episode') - sub_loss.plot(np.arange(self.__max_episodes), self.__all_avg_loss) + sub_loss.plot(np.arange(self._max_episodes), self._all_avg_loss) sub_eps = fig.add_subplot(223) sub_eps.set_title('Epsilon per episode') - sub_eps.plot(np.arange(self.__max_episodes), self.__all_eps) + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) sub_rewards = fig.add_subplot(224) sub_rewards.set_title('Avg. 
mean reward of last 100 episodes') - sub_rewards.plot(np.arange(self.__max_episodes), self.__all_mean_reward_last_100_episodes) + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) + + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) + + def save(self, path): + super(DqnTrainingStats, self).save(path) + np.save(os.path.join(path, 'avg_loss'), self._all_avg_loss) + + +class DdpgTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DdpgTrainingStats, self).__init__(max_episodes) + self._all_avg_critic_loss = np.zeros((max_episodes,)) + self._all_avg_actor_loss = np.zeros((max_episodes,)) + self._all_avg_qvalues = np.zeros((max_episodes,)) + + def add_avg_critic_loss(self, episode, avg_critic_loss): + self._all_avg_critic_loss[episode] = avg_critic_loss + + def add_avg_actor_loss(self, episode, avg_actor_loss): + self._all_avg_actor_loss[episode] = avg_actor_loss + + def add_avg_qvalues(self, episode, avg_qvalues): + self._all_avg_qvalues[episode] = avg_qvalues + + def log_episode( + self, episode, start_time, training_steps, actor_loss, + critic_loss, qvalues, eps, reward + ): + self.add_avg_actor_loss(episode, actor_loss) + self.add_avg_critic_loss(episode, critic_loss) + self.add_avg_qvalues(episode, qvalues) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(( + 'Avg. Actor Loss: %.3f ' + 'Avg. Critic Loss: %.3f ' + 'Avg. Q-Values: %.3f') % (actor_loss, critic_loss, qvalues)) + + self.logger.info(info) + return avg_reward + + def save(self, path): + super(DdpgTrainingStats, self).save(path) + np.save(os.path.join( + path, 'avg_critic_loss'), self._all_avg_critic_loss) + np.save(os.path.join(path, 'avg_actor_loss'), self._all_avg_actor_loss) + np.save(os.path.join(path, 'avg_qvalues'), self._all_avg_qvalues) + + def save_stats(self, path): + fig = plt.figure(figsize=(120, 120)) + + sub_rewards = fig.add_subplot(321) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) + + sub_actor_loss = fig.add_subplot(322) + sub_actor_loss.set_title('Avg. Actor Loss per episode') + sub_actor_loss.plot( + np.arange(self._max_episodes), self._all_avg_actor_loss) + + sub_critic_loss = fig.add_subplot(323) + sub_critic_loss.set_title('Avg. Critic Loss per episode') + sub_critic_loss.plot( + np.arange(self._max_episodes), self._all_avg_critic_loss) + + sub_qvalues = fig.add_subplot(324) + sub_qvalues.set_title('Avg. QValues per episode') + sub_qvalues.plot( + np.arange(self._max_episodes), self._all_avg_qvalues) + + sub_eps = fig.add_subplot(325) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) + + sub_rewards = fig.add_subplot(326) + sub_rewards.set_title('Avg. 
mean reward of last 100 episodes') + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) - plt.savefig(file) \ No newline at end of file + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) diff --git a/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py b/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py index afcfdbb..39ecc4b 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py +++ b/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py @@ -1,29 +1,27 @@ from reinforcement_learning.agent import DqnAgent from reinforcement_learning.util import AgentSignalHandler +from reinforcement_learning.cnnarch_logger import ArchLogger import reinforcement_learning.environment import CNNCreator_reinforcementConfig2 import os import sys import re -import logging +import time +import numpy as np import mxnet as mx -session_output_dir = 'session' -agent_name='reinforcement_agent' -session_param_output = os.path.join(session_output_dir, agent_name) -def resume_session(): - session_param_output = os.path.join(session_output_dir, agent_name) +def resume_session(sessions_dir): resume_session = False resume_directory = None - if os.path.isdir(session_output_dir) and os.path.isdir(session_param_output): + if os.path.isdir(sessions_dir): regex = re.compile(r'\d\d\d\d-\d\d-\d\d-\d\d-\d\d') - dir_content = os.listdir(session_param_output) + dir_content = os.listdir(sessions_dir) session_files = filter(regex.search, dir_content) session_files.sort(reverse=True) for d in session_files: - interrupted_session_dir = os.path.join(session_param_output, d, '.interrupted_session') + interrupted_session_dir = os.path.join(sessions_dir, d, '.interrupted_session') if os.path.isdir(interrupted_session_dir): resume = raw_input('Interrupted session from {} found. Do you want to resume? 
(y/n) '.format(d)) if resume == 'y': @@ -32,70 +30,89 @@ def resume_session(): break return resume_session, resume_directory + if __name__ == "__main__": + agent_name = 'reinforcement_agent' + # Prepare output directory and logger + all_output_dir = os.path.join('model', agent_name) + output_directory = os.path.join( + all_output_dir, + time.strftime('%Y-%m-%d-%H-%M-%S', + time.localtime(time.time()))) + ArchLogger.set_output_directory(output_directory) + ArchLogger.set_logger_name(agent_name) + ArchLogger.set_output_level(ArchLogger.INFO) + env = reinforcement_learning.environment.GymEnvironment('CartPole-v1') - context = mx.cpu() - net_creator = CNNCreator_reinforcementConfig2.CNNCreator_reinforcementConfig2() - net_creator.construct(context) - replay_memory_params = { - 'method':'buffer', - 'memory_size':10000, - 'sample_size':32, - 'state_dtype':'float32', - 'action_dtype':'uint8', - 'rewards_dtype':'float32' - } + context = mx.cpu() + qnet_creator = CNNCreator_reinforcementConfig2.CNNCreator_reinforcementConfig2() + qnet_creator.construct(context) - policy_params = { - 'method':'epsgreedy', - 'epsilon': 1, - 'min_epsilon': 0.01, - 'epsilon_decay_method': 'linear', - 'epsilon_decay': 0.0001, + agent_params = { + 'environment': env, + 'replay_memory_params': { + 'method': 'buffer', + 'memory_size': 10000, + 'sample_size': 32, + 'state_dtype': 'float32', + 'action_dtype': 'float32', + 'rewards_dtype': 'float32' + }, + 'strategy_params': { + 'method':'epsgreedy', + 'epsilon': 1, + 'min_epsilon': 0.01, + 'epsilon_decay_method': 'linear', + 'epsilon_decay': 0.0001, + 'epsilon_decay_start': 20, + }, + 'agent_name': agent_name, + 'verbose': True, + 'output_directory': output_directory, + 'state_dim': (8,), + 'action_dim': (3,), + 'discount_factor': 0.999, + 'training_episodes': 200, + 'train_interval': 1, + 'snapshot_interval': 20, + 'max_episode_step': 250, + 'evaluation_samples': 100, + 'target_score': 185.5, + 'qnet':qnet_creator.net, + 'use_fix_target': False, + 'loss_function': 'euclidean', + 'optimizer': 'rmsprop', + 'optimizer_params': { + 'weight_decay': 0.01, + 'centered': True, + 'gamma2': 0.9, + 'gamma1': 0.9, + 'clip_weights': 10.0, + 'learning_rate_decay': 0.9, + 'epsilon': 1.0E-6, + 'rescale_grad': 1.1, + 'clip_gradient': 10.0, + 'learning_rate_minimum': 1.0E-5, + 'learning_rate_policy': 'step', + 'learning_rate': 0.001, + 'step_size': 1000 }, + 'double_dqn': False, } - resume_session, resume_directory = resume_session() + resume, resume_directory = resume_session(all_output_dir) - if resume_session: - agent = DqnAgent.resume_from_session(resume_directory, net_creator.net, env) + if resume: + output_directory, _ = os.path.split(resume_directory) + ArchLogger.set_output_directory(output_directory) + resume_agent_params = { + 'session_dir': resume_directory, + 'environment': env, + 'net': qnet_creator.net, + } + agent = DqnAgent.resume_from_session(**resume_agent_params) else: - agent = DqnAgent( - network = net_creator.net, - environment=env, - replay_memory_params=replay_memory_params, - policy_params=policy_params, - state_dim=net_creator.get_input_shapes()[0], - discount_factor=0.999, - loss_function='euclidean', - optimizer='rmsprop', - optimizer_params={ - 'weight_decay': 0.01, - 'centered': True, - 'gamma2': 0.9, - 'gamma1': 0.9, - 'clip_weights': 10.0, - 'learning_rate_decay': 0.9, - 'epsilon': 1.0E-6, - 'rescale_grad': 1.1, - 'clip_gradient': 10.0, - 'learning_rate_minimum': 1.0E-5, - 'learning_rate_policy': 'step', - 'learning_rate': 0.001, - 'step_size': 1000 }, - 
training_episodes=200, - train_interval=1, - use_fix_target=False, - double_dqn = False, - snapshot_interval=20, - agent_name=agent_name, - max_episode_step=250, - output_directory=session_output_dir, - verbose=True, - live_plot = True, - make_logfile=True, - target_score=185.5 - ) + agent = DqnAgent(**agent_params) signal_handler = AgentSignalHandler() signal_handler.register_agent(agent) @@ -103,4 +120,4 @@ if __name__ == "__main__": train_successful = agent.train() if train_successful: - agent.save_best_network(net_creator._model_dir_ + net_creator._model_prefix_ + '_newest', epoch=0) \ No newline at end of file + agent.save_best_network(qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_newest', epoch=0) diff --git a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/action_policy.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/action_policy.py deleted file mode 100644 index f43a211..0000000 --- a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/action_policy.py +++ /dev/null @@ -1,73 +0,0 @@ -import numpy as np - -class ActionPolicyBuilder(object): - def __init__(self): - pass - - def build_by_params(self, - method='epsgreedy', - epsilon=0.5, - min_epsilon=0.05, - epsilon_decay_method='no', - epsilon_decay=0.0, - action_dim=None): - - if epsilon_decay_method == 'linear': - decay = LinearDecay(eps_decay=epsilon_decay, min_eps=min_epsilon) - else: - decay = NoDecay() - - if method == 'epsgreedy': - assert action_dim is not None - assert len(action_dim) == 1 - return EpsilonGreedyActionPolicy(eps=epsilon, - number_of_actions=action_dim[0], decay=decay) - else: - assert action_dim is not None - assert len(action_dim) == 1 - return GreedyActionPolicy() - -class EpsilonGreedyActionPolicy(object): - def __init__(self, eps, number_of_actions, decay): - self.eps = eps - self.cur_eps = eps - self.__number_of_actions = number_of_actions - self.__decay_method = decay - - def select_action(self, values): - do_exploration = (np.random.rand() < self.cur_eps) - if do_exploration: - action = np.random.randint(low=0, high=self.__number_of_actions) - else: - action = values.asnumpy().argmax() - return action - - def decay(self): - self.cur_eps = self.__decay_method.decay(self.cur_eps) - - -class GreedyActionPolicy(object): - def __init__(self): - pass - - def select_action(self, values): - return values.asnumpy().argmax() - - def decay(self): - pass - - -class NoDecay(object): - def __init__(self): - pass - - def decay(self, cur_eps): - return cur_eps - -class LinearDecay(object): - def __init__(self, eps_decay, min_eps=0): - self.eps_decay = eps_decay - self.min_eps = min_eps - - def decay(self, cur_eps): - return max(cur_eps - self.eps_decay, self.min_eps) \ No newline at end of file diff --git a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py index acbc974..c06b32a 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py @@ -2,118 +2,382 @@ import mxnet as mx import numpy as np import time import os -import logging import sys import util import matplotlib.pyplot as plt +import pyprind +from cnnarch_logger import ArchLogger from replay_memory import ReplayMemoryBuilder -from action_policy import ActionPolicyBuilder -from util import copy_net, get_loss_function +from 
strategy import StrategyBuilder +from util import copy_net, get_loss_function,\ + copy_net_with_two_inputs, DdpgTrainingStats, DqnTrainingStats,\ + make_directory_if_not_exist from mxnet import nd, gluon, autograd -class DqnAgent(object): - def __init__(self, - network, + +class Agent(object): + def __init__( + self, environment, replay_memory_params, - policy_params, + strategy_params, state_dim, + action_dim, ctx=None, discount_factor=.9, - loss_function='euclidean', - optimizer='rmsprop', - optimizer_params = {'learning_rate':0.09}, training_episodes=50, train_interval=1, - use_fix_target=False, - double_dqn = False, - target_update_interval=10, + start_training=0, snapshot_interval=200, - agent_name='Dqn_agent', + agent_name='Agent', max_episode_step=99999, + evaluation_samples=1000, output_directory='model_parameters', verbose=True, - live_plot = True, - make_logfile=True, - target_score=None): - assert 0 < discount_factor <= 1 - assert train_interval > 0 - assert target_update_interval > 0 - assert snapshot_interval > 0 - assert max_episode_step > 0 - assert training_episodes > 0 - assert replay_memory_params is not None - assert type(state_dim) is tuple - - self.__ctx = mx.gpu() if ctx == 'gpu' else mx.cpu() - self.__qnet = network - - self.__environment = environment - self.__discount_factor = discount_factor - self.__training_episodes = training_episodes - self.__train_interval = train_interval - self.__verbose = verbose - self.__state_dim = state_dim - self.__action_dim = self.__qnet(nd.random_normal(shape=((1,) + self.__state_dim), ctx=self.__ctx)).shape[1:] + target_score=None + ): + assert 0 < discount_factor <= 1,\ + 'Discount factor must be between 0 and 1' + assert train_interval > 0, 'Train interval must be greater 0' + assert snapshot_interval > 0, 'Snapshot interval must be greater 0' + assert max_episode_step > 0,\ + 'Maximal steps per episode must be greater 0' + assert training_episodes > 0, 'Trainings episode must be greater 0' + assert replay_memory_params is not None,\ + 'Replay memory parameter not set' + assert type(state_dim) is tuple, 'State dimension is not a tuple' + assert type(action_dim) is tuple, 'Action dimension is not a tuple' + + self._logger = ArchLogger.get_logger() + self._ctx = mx.gpu() if ctx == 'gpu' else mx.cpu() + self._environment = environment + self._discount_factor = discount_factor + self._training_episodes = training_episodes + self._train_interval = train_interval + self._verbose = verbose + self._state_dim = state_dim replay_memory_params['state_dim'] = state_dim - self.__replay_memory_params = replay_memory_params + replay_memory_params['action_dim'] = action_dim + self._replay_memory_params = replay_memory_params rm_builder = ReplayMemoryBuilder() - self.__memory = rm_builder.build_by_params(**replay_memory_params) - self.__minibatch_size = self.__memory.sample_size - - policy_params['action_dim'] = self.__action_dim - self.__policy_params = policy_params - p_builder = ActionPolicyBuilder() - self.__policy = p_builder.build_by_params(**policy_params) - - self.__target_update_interval = target_update_interval - self.__target_qnet = copy_net(self.__qnet, self.__state_dim, ctx=self.__ctx) - self.__loss_function_str = loss_function - self.__loss_function = get_loss_function(loss_function) - self.__agent_name = agent_name - self.__snapshot_interval = snapshot_interval - self.__creation_time = time.time() - self.__max_episode_step = max_episode_step - self.__optimizer = optimizer - self.__optimizer_params = optimizer_params - 
self.__make_logfile = make_logfile - self.__double_dqn = double_dqn - self.__use_fix_target = use_fix_target - self.__live_plot = live_plot - self.__user_given_directory = output_directory - self.__target_score = target_score - - self.__interrupt_flag = False + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + self._action_dim = action_dim + + strategy_params['action_dim'] = self._action_dim + self._strategy_params = strategy_params + strategy_builder = StrategyBuilder() + self._strategy = strategy_builder.build_by_params(**strategy_params) + self._agent_name = agent_name + self._snapshot_interval = snapshot_interval + self._creation_time = time.time() + self._max_episode_step = max_episode_step + self._start_training = start_training + self._output_directory = output_directory + self._target_score = target_score + + self._evaluation_samples = evaluation_samples + self._best_avg_score = -np.infty + self._best_net = None + + self._interrupt_flag = False + self._training_stats = None # Training Context - self.__current_episode = 0 - self.__total_steps = 0 - - # Initialize best network - self.__best_net = copy_net(self.__qnet, self.__state_dim, self.__ctx) - self.__best_avg_score = None + self._current_episode = 0 + self._total_steps = 0 - # Gluon Trainer definition - self.__training_stats = None + @property + def current_episode(self): + return self._current_episode - # Prepare output directory and logger - self.__output_directory = output_directory\ - + '/' + self.__agent_name\ - + '/' + time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(self.__creation_time)) - self.__logger = self.__setup_logging() - self.__logger.info('Agent created with following parameters: {}'.format(self.__make_config_dict())) + @property + def environment(self): + return self._environment - @classmethod - def from_config_file(cls, network, environment, config_file_path, ctx=None): + def save_config_file(self): import json - # Load config - with open(config_file_path, 'r') as config_file: - config_dict = json.load(config_file) - return cls(network, environment, ctx=ctx, **config_dict) + make_directory_if_not_exist(self._output_directory) + filename = os.path.join(self._output_directory, 'config.json') + config = self._make_config_dict() + with open(filename, mode='w') as fp: + json.dump(config, fp, indent=4) + + def set_interrupt_flag(self, interrupt): + self._interrupt_flag = interrupt + + def _interrupt_training(self): + import pickle + self._logger.info('Training interrupted; Store state for resuming') + session_dir = self._get_session_dir() + agent_session_file = os.path.join(session_dir, 'agent.p') + logger = self._logger + + self._make_pickle_ready(session_dir) + + with open(agent_session_file, 'wb') as f: + pickle.dump(self, f, protocol=2) + logger.info('State successfully stored') + + def _make_pickle_ready(self, session_dir): + del self._training_stats.logger + self._logger = None + self._environment.close() + self._environment = None + self._save_net(self._best_net, 'best_net', session_dir) + self._best_net = None + + def _make_config_dict(self): + config = dict() + config['state_dim'] = self._state_dim + config['action_dim'] = self._action_dim + config['ctx'] = str(self._ctx) + config['discount_factor'] = self._discount_factor + config['strategy_params'] = self._strategy_params + config['replay_memory_params'] = self._replay_memory_params + config['training_episodes'] = self._training_episodes + config['start_training'] = 
self._start_training + config['evaluation_samples'] = self._evaluation_samples + config['train_interval'] = self._train_interval + config['snapshot_interval'] = self._snapshot_interval + config['agent_name'] = self._agent_name + config['max_episode_step'] = self._max_episode_step + config['output_directory'] = self._output_directory + config['verbose'] = self._verbose + config['target_score'] = self._target_score + return config + + def _adjust_optimizer_params(self, optimizer_params): + if 'weight_decay' in optimizer_params: + optimizer_params['wd'] = optimizer_params['weight_decay'] + del optimizer_params['weight_decay'] + if 'learning_rate_decay' in optimizer_params: + min_learning_rate = 1e-8 + if 'learning_rate_minimum' in optimizer_params: + min_learning_rate = optimizer_params['learning_rate_minimum'] + del optimizer_params['learning_rate_minimum'] + optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( + optimizer_params['step_size'], + factor=optimizer_params['learning_rate_decay'], + stop_factor_lr=min_learning_rate) + del optimizer_params['step_size'] + del optimizer_params['learning_rate_decay'] + + return optimizer_params + + def _sample_from_memory(self): + states, actions, rewards, next_states, terminals\ + = self._memory.sample(batch_size=self._minibatch_size) + states = nd.array(states, ctx=self._ctx) + actions = nd.array(actions, ctx=self._ctx) + rewards = nd.array(rewards, ctx=self._ctx) + next_states = nd.array(next_states, ctx=self._ctx) + terminals = nd.array(terminals, ctx=self._ctx) + return states, actions, rewards, next_states, terminals + + def evaluate(self, target=None, sample_games=100, verbose=True): + target = self._target_score if target is None else target + if target: + target_achieved = 0 + total_reward = 0 + + self._logger.info('Sampling from {} games...'.format(sample_games)) + for g in pyprind.prog_bar(range(sample_games)): + state = self._environment.reset() + step = 0 + game_reward = 0 + terminal = False + while not terminal and (step < self._max_episode_step): + action = self.get_next_action(state) + state, reward, terminal, _ = self._environment.step(action) + game_reward += reward + step += 1 + + if verbose: + info = 'Game %d: Reward %f' % (g, game_reward) + self._logger.debug(info) + if target: + if game_reward >= target: + target_achieved += 1 + total_reward += game_reward + + avg_reward = float(total_reward)/float(sample_games) + info = 'Avg. 
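For illustration, this is the kind of translation _adjust_optimizer_params above performs on the generated optimizer settings (the concrete values are made up): Gluon's Trainer expects 'wd' and an lr_scheduler object rather than the generator's 'weight_decay' and 'learning_rate_decay'/'step_size' keys.

import mxnet as mx

generated_params = {
    'learning_rate': 0.001,
    'weight_decay': 0.01,
    'learning_rate_decay': 0.9,
    'learning_rate_minimum': 1e-5,
    'step_size': 1000,
}

# Equivalent of what _adjust_optimizer_params produces from the dict above.
adjusted_params = {
    'learning_rate': 0.001,
    'wd': 0.01,
    'lr_scheduler': mx.lr_scheduler.FactorScheduler(
        1000, factor=0.9, stop_factor_lr=1e-5),
}
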
Reward: %f' % avg_reward + if target: + target_achieved_ratio = int( + (float(target_achieved)/float(sample_games))*100) + info += '; Target Achieved in %d%% of games'\ + % (target_achieved_ratio) + + if verbose: + self._logger.info(info) + return avg_reward + + def _do_snapshot_if_in_interval(self, episode): + do_snapshot =\ + (episode != 0 and (episode % self._snapshot_interval == 0)) + if do_snapshot: + self.save_parameters(episode=episode) + self._evaluate() + + def _evaluate(self, verbose=True): + avg_reward = self.evaluate( + sample_games=self._evaluation_samples, verbose=False) + info = 'Evaluation -> Average Reward in {} games: {}'.format( + self._evaluation_samples, avg_reward) + + if self._best_avg_score is None or self._best_avg_score <= avg_reward: + self._save_current_as_best_net() + self._best_avg_score = avg_reward + if verbose: + self._logger.info(info) + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _do_training(self): + return (self._total_steps % self._train_interval == 0) and\ + (self._memory.is_sample_possible(self._minibatch_size)) and\ + (self._current_episode >= self._start_training) + + def _check_interrupt_routine(self): + if self._interrupt_flag: + self._interrupt_flag = False + self._interrupt_training() + return True + return False + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + assert self._output_directory + assert isinstance(net, gluon.HybridBlock) + make_directory_if_not_exist(self._output_directory) + + if(episode is not None): + self._logger.info( + 'Saving model parameters after episode %d' % episode) + filename = filename + '-ep{}'.format(episode) + else: + self._logger.info('Saving model parameters') + self._save_net(net, filename) + + def _save_net(self, net, filename, filedir=None): + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename + '.params') + net.save_parameters(filename) + + def save_best_network(self, path, epoch=0): + self._logger.info( + 'Saving best network with average reward of {}'.format( + self._best_avg_score)) + self._best_net.export(path, epoch=epoch) + + def _get_session_dir(self): + session_dir = os.path.join( + self._output_directory, '.interrupted_session') + make_directory_if_not_exist(session_dir) + return session_dir + + def _save_current_as_best_net(self): + raise NotImplementedError + + def get_next_action(self, state): + raise NotImplementedError + + def save_parameters(self, episode): + raise NotImplementedError + + def train(self, episodes=None): + raise NotImplementedError + + +class DdpgAgent(Agent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + 
strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + assert critic is not None, 'Critic not set' + assert actor is not None, 'Actor is not set' + assert soft_target_update_rate > 0,\ + 'Target update must be greater zero' + assert actor_optimizer is not None, 'No actor optimizer set' + assert critic_optimizer is not None, 'No critic optimizer set' + + self._actor = actor + self._critic = critic + + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + self._actor_optimizer = actor_optimizer + self._actor_optimizer_params = self._adjust_optimizer_params( + actor_optimizer_params) + + self._critic_optimizer = critic_optimizer + self._critic_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + self._soft_target_update_rate = soft_target_update_rate + + self._logger.info( + 'Agent created with following parameters: {}'.format( + self._make_config_dict())) + + self._best_net = self._copy_actor() + + self._training_stats = DdpgTrainingStats(self._training_episodes) + + def _make_pickle_ready(self, session_dir): + super(DdpgAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._actor, 'actor', session_dir) + self._actor = None + self._save_net(self._critic, 'critic', session_dir) + self._critic = None + self._save_net(self._actor_target, 'actor_target', session_dir) + self._actor_target = None + self._save_net(self._critic_target, 'critic_target', session_dir) + self._critic_target = None @classmethod - def resume_from_session(cls, session_dir, net, environment): + def resume_from_session(cls, session_dir, actor, critic, environment): import pickle if not os.path.exists(session_dir): raise ValueError('Session directory does not exist') @@ -121,386 +385,516 @@ class DqnAgent(object): files = dict() files['agent'] = os.path.join(session_dir, 'agent.p') files['best_net_params'] = os.path.join(session_dir, 'best_net.params') - files['q_net_params'] = os.path.join(session_dir, 'qnet.params') - files['target_net_params'] = os.path.join(session_dir, 'target_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') for file in files.values(): if not os.path.exists(file): - raise ValueError('Session directory is not complete: {} is missing'.format(file)) + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) with open(files['agent'], 'rb') as f: agent = pickle.load(f) - agent.__environment = environment - agent.__qnet = net - agent.__qnet.load_parameters(files['q_net_params'], agent.__ctx) - agent.__qnet.hybridize() - agent.__qnet(nd.random_normal(shape=((1,) + agent.__state_dim), ctx=agent.__ctx)) - agent.__best_net = copy_net(agent.__qnet, agent.__state_dim, agent.__ctx) - agent.__best_net.load_parameters(files['best_net_params'], agent.__ctx) - agent.__target_qnet = copy_net(agent.__qnet, agent.__state_dim, agent.__ctx) - 
agent.__target_qnet.load_parameters(files['target_net_params'], agent.__ctx) + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) - agent.__logger = agent.__setup_logging(append=True) - agent.__training_stats.logger = agent.__logger - agent.__logger.info('Agent was retrieved; Training can be continued') + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') return agent - def __interrupt_training(self): - import pickle - self.__logger.info('Training interrupted; Store state for resuming') - session_dir = os.path.join(self.__output_directory, '.interrupted_session') - if not os.path.exists(session_dir): - os.mkdir(session_dir) + def _save_current_as_best_net(self): + self._best_net = self._copy_actor() - del self.__training_stats.logger - logger = self.__logger - self.__logger = None - self.__environment.close() - self.__environment = None + def get_next_action(self, state): + action = self._actor(nd.array([state], ctx=self._ctx)) + return action[0].asnumpy() - self.__save_net(self.__qnet, 'qnet', session_dir) - self.__qnet = None - self.__save_net(self.__best_net, 'best_net', session_dir) - self.__best_net = None - self.__save_net(self.__target_qnet, 'target_net', session_dir) - self.__target_qnet = None + def save_parameters(self, episode): + self._save_parameters(self._actor, episode=episode) - agent_session_file = os.path.join(session_dir, 'agent.p') + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start DDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes - with open(agent_session_file, 'wb') as f: - pickle.dump(self, f) - self.__logger = logger - logger.info('State successfully stored') + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) - @property - def current_episode(self): - return self.__current_episode + # Initialize target Q' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() - @property - def environment(self): - return self.__environment + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() - def __adjust_optimizer_params(self, optimizer_params): - if 'weight_decay' in optimizer_params: - optimizer_params['wd'] = optimizer_params['weight_decay'] - del optimizer_params['weight_decay'] - 
if 'learning_rate_decay' in optimizer_params: - min_learning_rate = 1e-8 - if 'learning_rate_minimum' in optimizer_params: - min_learning_rate = optimizer_params['learning_rate_minimum'] - del optimizer_params['learning_rate_minimum'] - optimizer_params['lr_scheduler'] = mx.lr.scheduler.FactorScheduler( - optimizer_params['step_size'], - factor=optimizer_params['learning_rate_decay'], - stop_factor_lr=min_learning_rate) - del optimizer_params['step_size'] - del optimizer_params['learning_rate_decay'] + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) - return optimizer_params + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False - def set_interrupt_flag(self, interrupt): - self.__interrupt_flag = interrupt + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + actor_target_actions = self._actor_target(next_states) + critic_target_qvalues = self._critic_target( + next_states, actor_target_actions) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) + y = rewards + (1.0 - terminals) * self._discount_factor\ + * critic_target_qvalues + + # Train the critic network + with autograd.record(): + qvalues = self._critic(states, actions) + critic_loss = l2_loss(qvalues, y) + critic_loss.backward() + trainer_critic.step(self._minibatch_size) + + # Train the actor network + # Temporary critic so that gluon trainer does not mess + # with critic parameters + tmp_critic = self._copy_critic() + with autograd.record(): + actor_qvalues = tmp_critic(states, self._actor(states)) + # For maximizing qvalues we have to multiply with -1 + # as we use a minimizer + actor_loss = -1 * actor_qvalues + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss +=\ + np.sum(actor_loss.asnumpy()) / self._minibatch_size + episode_avg_q_value 
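The Bellman backup computed in the critic update above, shown with toy numbers (all values below are made up for illustration):

import numpy as np

# y = r + (1 - terminal) * gamma * Q'(s', mu'(s')), as in the critic update.
gamma = 0.99
rewards = np.array([[1.0], [0.5]])
terminals = np.array([[0.0], [1.0]])                  # second transition is terminal
critic_target_qvalues = np.array([[10.0], [10.0]])    # stand-in for Q'(s', mu'(s'))

y = rewards + (1.0 - terminals) * gamma * critic_target_qvalues
# -> [[10.9], [0.5]]: a terminal transition keeps only its immediate reward
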
+=\ + np.sum(actor_qvalues.asnumpy()) / self._minibatch_size - def __make_output_directory_if_not_exist(self): - assert self.__output_directory - if not os.path.exists(self.__output_directory): - os.makedirs(self.__output_directory) + training_steps += 1 - def __setup_logging(self, append=False): - assert self.__output_directory - assert self.__agent_name + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state - output_level = logging.DEBUG if self.__verbose else logging.WARNING - filemode = 'a' if append else 'w' + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if training_steps == 0\ + else (episode_actor_loss / training_steps) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if training_steps == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + self._strategy.decay(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DdpgAgent, self)._make_config_dict() + config['soft_target_update_rate'] = self._soft_target_update_rate + config['actor_optimizer'] = self._actor_optimizer + config['actor_optimizer_params'] = self._actor_optimizer_params + config['critic_optimizer'] = self._critic_optimizer + config['critic_optimizer_params'] = self._critic_optimizer_params + return config - logformat = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' - dateformat = '%d-%b-%y %H:%M:%S' - formatter = logging.Formatter(fmt=logformat, datefmt=dateformat) + def _soft_update(self, net, target, tau): + net_params = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(target.collect_params().items()): + target_params = p.data() + p.set_data((1.0 - tau) * target_params + tau * net_params[i]) + return target + + def _copy_actor(self): + assert self._actor is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + return copy_net(self._actor, self._state_dim, ctx=self._ctx) + + def _copy_critic(self): + assert self._critic is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic, self._state_dim, self._action_dim, ctx=self._ctx) + + +class DqnAgent(Agent): + def __init__( + self, + qnet, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + loss_function='euclidean', + optimizer='rmsprop', + optimizer_params={'learning_rate': 0.09}, + training_episodes=50, + start_training=0, + train_interval=1, + use_fix_target=False, + double_dqn=False, + target_update_interval=10, + snapshot_interval=200, + evaluation_samples=100, + agent_name='Dqn_agent', + max_episode_step=99999, + 
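A self-contained sketch of the Polyak averaging rule that _soft_update above applies to the DDPG target networks; the toy Dense blocks and the module-level function are illustrative, not part of the generated agent.

import mxnet as mx
from mxnet import gluon, nd

def soft_update(net, target, tau):
    # theta_target <- (1 - tau) * theta_target + tau * theta_online
    net_params = [p.data() for _, p in net.collect_params().items()]
    for i, (_, p) in enumerate(target.collect_params().items()):
        p.set_data((1.0 - tau) * p.data() + tau * net_params[i])
    return target

online = gluon.nn.Dense(4)
target = gluon.nn.Dense(4)
for block in (online, target):
    block.initialize(mx.init.Uniform(), ctx=mx.cpu())
    block(nd.ones((1, 8)))                            # force shape inference
target = soft_update(online, target, tau=0.001)
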
output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DqnAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + + self._qnet = qnet + self._target_update_interval = target_update_interval + self._target_qnet = copy_net( + self._qnet, self._state_dim, ctx=self._ctx) + self._loss_function_str = loss_function + self._loss_function = get_loss_function(loss_function) + self._optimizer = optimizer + self._optimizer_params = optimizer_params + self._double_dqn = double_dqn + self._use_fix_target = use_fix_target - logger = logging.getLogger('DQNAgent') - logger.setLevel(output_level) + # Initialize best network + self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) + self._best_avg_score = -np.infty - stream_handler = logging.StreamHandler(sys.stdout) - stream_handler.setLevel(output_level) - stream_handler.setFormatter(formatter) - logger.addHandler(stream_handler) + self._training_stats = None - if self.__make_logfile: - self.__make_output_directory_if_not_exist() - log_file = os.path.join(self.__output_directory, self.__agent_name + '.log') - file_handler = logging.FileHandler(log_file, mode=filemode) - file_handler.setLevel(output_level) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) + @classmethod + def resume_from_session(cls, session_dir, net, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') - return logger + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['q_net_params'] = os.path.join(session_dir, 'qnet.params') + files['target_net_params'] = os.path.join( + session_dir, 'target_net.params') - def __is_target_reached(self, avg_reward): - return self.__target_score is not None\ - and avg_reward > self.__target_score + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + agent._qnet = net + agent._qnet.load_parameters(files['q_net_params'], agent._ctx) + agent._qnet.hybridize() + agent._qnet(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + agent._best_net = copy_net(agent._qnet, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + agent._target_qnet = copy_net( + agent._qnet, agent._state_dim, agent._ctx) + agent._target_qnet.load_parameters( + files['target_net_params'], agent._ctx) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = agent._logger + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _make_pickle_ready(self, session_dir): + super(DqnAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._qnet, 'qnet', session_dir) + self._qnet = None + self._save_net(self._target_qnet, 'target_net', session_dir) + 
self._target_qnet = None def get_q_values(self, state, with_best=False): - return self.get_batch_q_values(nd.array([state], ctx=self.__ctx), with_best=with_best)[0] + return self.get_batch_q_values( + nd.array([state], ctx=self._ctx), with_best=with_best)[0] def get_batch_q_values(self, state_batch, with_best=False): - return self.__best_net(state_batch) if with_best else self.__qnet(state_batch) + return self._best_net(state_batch)\ + if with_best else self._qnet(state_batch) def get_next_action(self, state, with_best=False): q_values = self.get_q_values(state, with_best=with_best) action = q_values.asnumpy().argmax() - return q_values.asnumpy().argmax() + return action - def __sample_from_memory(self): - states, actions, rewards, next_states, terminals\ - = self.__memory.sample(batch_size=self.__minibatch_size) - states = nd.array(states, ctx=self.__ctx) - actions = nd.array(actions, ctx=self.__ctx) - rewards = nd.array(rewards, ctx=self.__ctx) - next_states = nd.array(next_states, ctx=self.__ctx) - terminals = nd.array(terminals, ctx=self.__ctx) - return states, actions, rewards, next_states, terminals - - def __determine_target_q_values(self, states, actions, rewards, next_states, terminals): - if self.__use_fix_target: - q_max_val = self.__target_qnet(next_states) + def __determine_target_q_values( + self, states, actions, rewards, next_states, terminals + ): + if self._use_fix_target: + q_max_val = self._target_qnet(next_states) else: - q_max_val = self.__qnet(next_states) + q_max_val = self._qnet(next_states) - if self.__double_dqn: - q_values_next_states = self.__qnet(next_states) - target_rewards = rewards + nd.choose_element_0index(q_max_val, nd.argmax_channel(q_values_next_states))\ - * (1.0 - terminals) * self.__discount_factor + if self._double_dqn: + q_values_next_states = self._qnet(next_states) + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_values_next_states))\ + * (1.0 - terminals) * self._discount_factor else: - target_rewards = rewards + nd.choose_element_0index(q_max_val, nd.argmax_channel(q_max_val))\ - * (1.0 - terminals) * self.__discount_factor + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_max_val))\ + * (1.0 - terminals) * self._discount_factor - target_qval = self.__qnet(states) + target_qval = self._qnet(states) for t in range(target_rewards.shape[0]): target_qval[t][actions[t]] = target_rewards[t] return target_qval def __train_q_net_step(self, trainer): - states, actions, rewards, next_states, terminals = self.__sample_from_memory() - target_qval = self.__determine_target_q_values(states, actions, rewards, next_states, terminals) + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + target_qval = self.__determine_target_q_values( + states, actions, rewards, next_states, terminals) with autograd.record(): - q_values = self.__qnet(states) - loss = self.__loss_function(q_values, target_qval) + q_values = self._qnet(states) + loss = self._loss_function(q_values, target_qval) loss.backward() - trainer.step(self.__minibatch_size) + trainer.step(self._minibatch_size) return loss - def __do_snapshot_if_in_interval(self, episode): - do_snapshot = (episode != 0 and (episode % self.__snapshot_interval == 0)) - if do_snapshot: - self.save_parameters(episode=episode) - self.__evaluate() - def __do_target_update_if_in_interval(self, total_steps): - do_target_update = (self.__use_fix_target and total_steps % self.__target_update_interval == 0) + do_target_update 
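The difference between the two branches of __determine_target_q_values above, illustrated with made-up Q-values:

import numpy as np

gamma = 0.9
reward, terminal = 1.0, 0.0
q_eval_next = np.array([2.0, 5.0])     # net used to evaluate s' (target net if use_fix_target)
q_online_next = np.array([4.0, 3.0])   # online q-network at s'

# Vanilla DQN: select and evaluate the next action with the same estimate.
vanilla_target = reward + (1.0 - terminal) * gamma * q_eval_next.max()
# = 1.0 + 0.9 * 5.0 = 5.5

# Double DQN: select with the online network, evaluate with the other one.
double_dqn_target = reward + (1.0 - terminal) * gamma \
    * q_eval_next[q_online_next.argmax()]
# = 1.0 + 0.9 * 2.0 = 2.8
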
= ( + self._use_fix_target and + (total_steps % self._target_update_interval == 0)) if do_target_update: - self.__logger.info('Target network is updated after {} steps'.format(total_steps)) - self.__target_qnet = copy_net(self.__qnet, self.__state_dim, self.__ctx) + self._logger.info( + 'Target network is updated after {} steps'.format(total_steps)) + self._target_qnet = copy_net( + self._qnet, self._state_dim, self._ctx) def train(self, episodes=None): - self.__logger.info("--- Start training ---") - trainer = gluon.Trainer(self.__qnet.collect_params(), self.__optimizer, self.__adjust_optimizer_params(self.__optimizer_params)) - episodes = episodes if episodes != None else self.__training_episodes - - resume = (self.__current_episode > 0) + self.save_config_file() + self._logger.info("--- Start training ---") + trainer = gluon.Trainer( + self._qnet.collect_params(), + self._optimizer, + self._adjust_optimizer_params(self._optimizer_params)) + episodes = episodes if episodes is not None\ + else self._training_episodes + + resume = (self._current_episode > 0) if resume: - self.__logger.info("Training session resumed") - self.__logger.info("Starting from episode {}".format(self.__current_episode)) + self._logger.info("Training session resumed") + self._logger.info("Starting from episode {}".format( + self._current_episode)) else: - self.__training_stats = util.TrainingStats(self.__logger, episodes, self.__live_plot) + self._training_stats = DqnTrainingStats(episodes) - # Implementation Deep Q Learning described by Mnih et. al. in Playing Atari with Deep Reinforcement Learning - while self.__current_episode < episodes: - # Check interrupt flag - if self.__interrupt_flag: - self.__interrupt_flag = False - self.__interrupt_training() + # Implementation Deep Q Learning described by + # Mnih et. al. in Playing Atari with Deep Reinforcement Learning + while self._current_episode < episodes: + if self._check_interrupt_routine(): return False step = 0 episode_reward = 0 start = time.time() - state = self.__environment.reset() + state = self._environment.reset() episode_loss = 0 training_steps = 0 - while step < self.__max_episode_step: - #1. Choose an action based on current game state and policy - q_values = self.__qnet(nd.array([state], ctx=self.__ctx)) - action = self.__policy.select_action(q_values[0]) + while step < self._max_episode_step: + # 1. Choose an action based on current game state and policy + q_values = self._qnet(nd.array([state], ctx=self._ctx)) + action = self._strategy.select_action(q_values[0]) - #2. Play the game for a single step - next_state, reward, terminal, _ = self.__environment.step(action) + # 2. Play the game for a single step + next_state, reward, terminal, _ =\ + self._environment.step(action) - #3. Store transition in replay memory - self.__memory.append(state, action, reward, next_state, terminal) + # 3. Store transition in replay memory + self._memory.append( + state, action, reward, next_state, terminal) - #4. Train the network if in interval - do_training = (self.__total_steps % self.__train_interval == 0\ - and self.__memory.is_sample_possible(self.__minibatch_size)) - if do_training: + # 4. 
Train the network if in interval + if self._do_training(): loss = self.__train_q_net_step(trainer) - loss_sum = sum(loss).asnumpy()[0] - episode_loss += float(loss_sum)/float(self.__minibatch_size) training_steps += 1 + episode_loss +=\ + np.sum(loss.asnumpy()) / self._minibatch_size # Update target network if in interval - self.__do_target_update_if_in_interval(self.__total_steps) + self.__do_target_update_if_in_interval(self._total_steps) step += 1 - self.__total_steps += 1 + self._total_steps += 1 episode_reward += reward state = next_state if terminal: - episode_loss = episode_loss if training_steps > 0 else None - _, _, avg_reward = self.__training_stats.log_episode(self.__current_episode, start, training_steps, - episode_loss, self.__policy.cur_eps, episode_reward) + self._strategy.reset() break - self.__do_snapshot_if_in_interval(self.__current_episode) - self.__policy.decay() - - if self.__is_target_reached(avg_reward): - self.__logger.info('Target score is reached in average; Training is stopped') - break - - self.__current_episode += 1 + self._do_snapshot_if_in_interval(self._current_episode) - self.__evaluate() - training_stats_file = os.path.join(self.__output_directory, 'training_stats.pdf') - self.__training_stats.save_stats(training_stats_file) - self.__logger.info('--------- Training finished ---------') - return True - - def __save_net(self, net, filename, filedir=None): - filedir = self.__output_directory if filedir is None else filedir - filename = os.path.join(filedir, filename + '.params') - net.save_parameters(filename) + episode_loss = (episode_loss / training_steps)\ + if training_steps > 0 else 0 + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_loss, self._strategy.cur_eps, episode_reward) + self._strategy.decay(self._current_episode) - def save_parameters(self, episode=None, filename='dqn-agent-params'): - assert self.__output_directory - self.__make_output_directory_if_not_exist() - - if(episode != None): - self.__logger.info('Saving model parameters after episode %d' % episode) - filename = filename + '-ep{}'.format(episode) - else: - self.__logger.info('Saving model parameters') - self.__save_net(self.__qnet, filename) - - def evaluate(self, target=None, sample_games=100, verbose=True): - target = self.__target_score if target is None else target - if target: - target_achieved = 0 - total_reward = 0 - - for g in range(sample_games): - state = self.__environment.reset() - step = 0 - game_reward = 0 - while step < self.__max_episode_step: - action = self.get_next_action(state) - state, reward, terminal, _ = self.__environment.step(action) - game_reward += reward - - if terminal: - if verbose: - info = 'Game %d: Reward %f' % (g,game_reward) - self.__logger.debug(info) - if target: - if game_reward >= target: - target_achieved += 1 - total_reward += game_reward - break - - step += 1 - - avg_reward = float(total_reward)/float(sample_games) - info = 'Avg. 
Reward: %f' % avg_reward - if target: - target_achieved_ratio = int((float(target_achieved)/float(sample_games))*100) - info += '; Target Achieved in %d%% of games' % (target_achieved_ratio) - - if verbose: - self.__logger.info(info) - return avg_reward - - def __evaluate(self, verbose=True): - sample_games = 100 - avg_reward = self.evaluate(sample_games=sample_games, verbose=False) - info = 'Evaluation -> Average Reward in {} games: {}'.format(sample_games, avg_reward) - - if self.__best_avg_score is None or self.__best_avg_score <= avg_reward: - self.__best_net = copy_net(self.__qnet, self.__state_dim, self.__ctx) - self.__best_avg_score = avg_reward - info += ' (NEW BEST)' - - if verbose: - self.__logger.info(info) - - - - def play(self, update_frame=1, with_best=False): - step = 0 - state = self.__environment.reset() - total_reward = 0 - while step < self.__max_episode_step: - action = self.get_next_action(state, with_best=with_best) - state, reward, terminal, _ = self.__environment.step(action) - total_reward += reward - do_update_frame = (step % update_frame == 0) - if do_update_frame: - self.__environment.render() - time.sleep(.100) - - if terminal: + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') break - step += 1 - return total_reward + self._current_episode += 1 - def save_best_network(self, path, epoch=0): - self.__logger.info('Saving best network with average reward of {}'.format(self.__best_avg_score)) - self.__best_net.export(path, epoch=epoch) + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True - def __make_config_dict(self): - config = dict() - config['discount_factor'] = self.__discount_factor - config['optimizer'] = self.__optimizer - config['optimizer_params'] = self.__optimizer_params - config['policy_params'] = self.__policy_params - config['replay_memory_params'] = self.__replay_memory_params - config['loss_function'] = self.__loss_function_str - config['optimizer'] = self.__optimizer - config['training_episodes'] = self.__training_episodes - config['train_interval'] = self.__train_interval - config['use_fix_target'] = self.__use_fix_target - config['double_dqn'] = self.__double_dqn - config['target_update_interval'] = self.__target_update_interval - config['snapshot_interval']= self.__snapshot_interval - config['agent_name'] = self.__agent_name - config['max_episode_step'] = self.__max_episode_step - config['output_directory'] = self.__user_given_directory - config['verbose'] = self.__verbose - config['live_plot'] = self.__live_plot - config['make_logfile'] = self.__make_logfile - config['target_score'] = self.__target_score + def _make_config_dict(self): + config = super(DqnAgent, self)._make_config_dict() + config['optimizer'] = self._optimizer + config['optimizer_params'] = self._optimizer_params + config['loss_function'] = self._loss_function_str + config['use_fix_target'] = self._use_fix_target + config['double_dqn'] = self._double_dqn + config['target_update_interval'] = self._target_update_interval return config - def save_config_file(self): - import json - self.__make_output_directory_if_not_exist() - filename = os.path.join(self.__output_directory, 'config.json') - config = self.__make_config_dict() - with open(filename, mode='w') as fp: - json.dump(config, fp, 
indent=4) \ No newline at end of file + def save_parameters(self, episode): + self._save_parameters(self._qnet, episode=episode) + + def _save_current_as_best_net(self): + self._best_net = copy_net( + self._qnet, (1,) + self._state_dim, ctx=self._ctx) diff --git a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/cnnarch_logger.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/cnnarch_logger.py new file mode 100644 index 0000000..19f36b1 --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/cnnarch_logger.py @@ -0,0 +1,95 @@ +import logging +import sys +import os +import util + + +class ArchLogger(object): + _logger = None + + __output_level = logging.INFO + __logger_name = 'agent' + __output_directory = '.' + __append = True + __logformat = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + __dateformat = '%d-%b-%y %H:%M:%S' + + INFO = logging.INFO + DEBUG = logging.DEBUG + + @staticmethod + def set_output_level(output_level): + assert output_level is not None + ArchLogger.__output_level = output_level + + @staticmethod + def set_logger_name(logger_name): + assert logger_name is not None + ArchLogger.__logger_name = logger_name + + @staticmethod + def set_output_directory(output_directory): + assert output_directory is not None + ArchLogger.__output_directory = output_directory + + @staticmethod + def set_append(append): + assert append is not None + ArchLogger.__append = append + + @staticmethod + def set_log_format(logformat, dateformat): + assert logformat is not None + assert dateformat is not None + ArchLogger.__logformat = logformat + ArchLogger.__dateformat = dateformat + + @staticmethod + def init_logger(make_log_file=True): + assert ArchLogger._logger is None, 'Logger init already called' + filemode = 'a' if ArchLogger.__append else 'w' + formatter = logging.Formatter( + fmt=ArchLogger.__logformat, datefmt=ArchLogger.__dateformat) + + logger = logging.getLogger(ArchLogger.__logger_name) + logger.propagate = False + + if not logger.handlers: + logger.setLevel(ArchLogger.__output_level) + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setLevel(ArchLogger.__output_level) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + if make_log_file: + util.make_directory_if_not_exist(ArchLogger.__output_directory) + log_file = os.path.join( + ArchLogger.__output_directory, + ArchLogger.__logger_name + '.log') + file_handler = logging.FileHandler(log_file, mode=filemode) + file_handler.setLevel(ArchLogger.__output_level) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + ArchLogger._logger = logger + + @staticmethod + def get_logger(): + if ArchLogger._logger is None: + ArchLogger.init_logger() + assert ArchLogger._logger is not None + return ArchLogger._logger + +if __name__ == "__main__": + print('=== Test logger ===') + ArchLogger.set_logger_name('TestLogger') + ArchLogger.set_output_directory('test_log') + ArchLogger.init_logger() + logger = ArchLogger.get_logger() + logger.warning('This is a warning') + logger.debug('This is a debug information, which you should not see') + logger.info('This is a normal information') + assert os.path.exists('test_log')\ + and os.path.isfile(os.path.join('test_log', 'TestLogger.log')),\ + 'Test failed: No logfile exists' + import shutil + shutil.rmtree('test_log') \ No newline at end of file diff --git 
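The ArchLogger added above centralizes logging for the generated reinforcement-learning package: the trainer configures it once, and every other module fetches the same logger through get_logger(). A minimal usage sketch follows; the logger name and output directory are made-up example values, and the import assumes the generated reinforcement_learning package is on the Python path.

from reinforcement_learning.cnnarch_logger import ArchLogger

# Configure the static logger once, before any agent object is constructed.
ArchLogger.set_logger_name('test_agent')
ArchLogger.set_output_directory('model/test_agent')
ArchLogger.set_output_level(ArchLogger.DEBUG)
ArchLogger.init_logger(make_log_file=True)

# Later calls from agent.py, util.py, etc. return the same configured logger.
logger = ArchLogger.get_logger()
logger.info('Training starts')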
a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/environment.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/environment.py index 47787a8..381ec9e 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/environment.py +++ b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/environment.py @@ -33,13 +33,6 @@ class GymEnvironment(Environment): def state_dim(self): return self.__env.observation_space.shape - @property - def state_dtype(self): - return 'float32' - - @property - def action_dtype(self): - return 'uint8' @property def number_of_actions(self): diff --git a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/replay_memory.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/replay_memory.py index e66cd93..d22147a 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/replay_memory.py +++ b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/replay_memory.py @@ -1,58 +1,90 @@ import numpy as np + class ReplayMemoryBuilder(object): def __init__(self): self.__supported_methods = ['online', 'buffer', 'combined'] - def build_by_params(self, + def build_by_params( + self, state_dim, method='online', state_dtype='float32', + action_dim=(1,), action_dtype='uint8', rewards_dtype='float32', memory_size=1000, - sample_size=32): + sample_size=32 + ): assert state_dim is not None + assert action_dim is not None assert method in self.__supported_methods if method == 'online': - return self.build_online_memory(state_dim=state_dim, state_dtype=state_dtype, - action_dtype=action_dtype, rewards_dtype=rewards_dtype) + return self.build_online_memory( + state_dim=state_dim, state_dtype=state_dtype, + action_dtype=action_dtype, action_dim=action_dim, + rewards_dtype=rewards_dtype) else: assert memory_size is not None and memory_size > 0 assert sample_size is not None and sample_size > 0 if method == 'buffer': - return self.build_buffered_memory(state_dim=state_dim, sample_size=sample_size, - memory_size=memory_size, state_dtype=state_dtype, action_dtype=action_dtype, + return self.build_buffered_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, rewards_dtype=rewards_dtype) else: - return self.build_combined_memory(state_dim=state_dim, sample_size=sample_size, - memory_size=memory_size, state_dtype=state_dtype, action_dtype=action_dtype, + return self.build_combined_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, rewards_dtype=rewards_dtype) - def build_buffered_memory(self, state_dim, memory_size=1000, sample_size=1, state_dtype='float32', - action_dtype='uint8', rewards_dtype='float32'): + def build_buffered_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): assert memory_size > 0 assert sample_size > 0 - return ReplayMemory(state_dim, size=memory_size, sample_size=sample_size, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) - - def build_combined_memory(self, state_dim, memory_size=1000, sample_size=1, state_dtype='float32', - action_dtype='uint8', rewards_dtype='float32'): + return ReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + 
state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_combined_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): assert memory_size > 0 assert sample_size > 0 - return CombinedReplayMemory(state_dim, size=memory_size, sample_size=sample_size, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) + return CombinedReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_online_memory( + self, state_dim, state_dtype, action_dtype, action_dim, rewards_dtype + ): + return OnlineReplayMemory( + state_dim, state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) - def build_online_memory(self, state_dim, state_dtype='float32', action_dtype='uint8', - rewards_dtype='float32'): - return OnlineReplayMemory(state_dim, state_dtype=state_dtype, action_dtype=action_dtype, - rewards_dtype=rewards_dtype) class ReplayMemory(object): - def __init__(self, state_dim, sample_size, size=1000, state_dtype='uint8', action_dtype='uint8', rewards_dtype='float32'): + def __init__( + self, + state_dim, + sample_size, + size=1000, + action_dim=(1,), + state_dtype='float32', + action_dtype='uint8', + rewards_dtype='float32' + ): assert size > 0, "Size must be greater than zero" assert type(state_dim) is tuple, "State dimension must be a tuple" + assert type(action_dim) is tuple, "Action dimension must be a tuple" assert sample_size > 0 self._size = size self._sample_size = sample_size @@ -60,12 +92,15 @@ class ReplayMemory(object): self._pointer = 0 self._state_dim = state_dim self._state_dtype = state_dtype + self._action_dim = action_dim self._action_dtype = action_dtype self._rewards_dtype = rewards_dtype self._states = np.zeros((self._size,) + state_dim, dtype=state_dtype) - self._actions = np.array([0] * self._size, dtype=action_dtype) + self._actions = np.zeros( + (self._size,) + action_dim, dtype=action_dtype) self._rewards = np.array([0] * self._size, dtype=rewards_dtype) - self._next_states = np.zeros((self._size,) + state_dim, dtype=state_dtype) + self._next_states = np.zeros( + (self._size,) + state_dim, dtype=state_dtype) self._terminals = np.array([0] * self._size, dtype='bool') @property @@ -93,17 +128,23 @@ class ReplayMemory(object): self._terminals[index] def is_sample_possible(self, batch_size=None): - batch_size = batch_size if batch_size is not None else self._sample_size + batch_size = batch_size if batch_size is not None\ + else self._sample_size return self._cur_size >= batch_size def sample(self, batch_size=None): - batch_size = batch_size if batch_size is not None else self._sample_size - assert self._cur_size >= batch_size, "Size of replay memory must be larger than batch size" - i=0 - states = np.zeros((batch_size,)+self._state_dim, dtype=self._state_dtype) - actions = np.zeros(batch_size, dtype=self._action_dtype) + batch_size = batch_size if batch_size is not None\ + else self._sample_size + assert self._cur_size >= batch_size,\ + "Size of replay memory must be larger than batch size" + i = 0 + states = np.zeros(( + batch_size,)+self._state_dim, dtype=self._state_dtype) + actions = np.zeros( + (batch_size,)+self._action_dim, dtype=self._action_dtype) rewards = np.zeros(batch_size, dtype=self._rewards_dtype) - next_states = 
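The rewritten builder threads an explicit action_dim through every memory variant, which is what allows DDPG to store vector-valued continuous actions instead of a single discrete index per transition. A usage sketch, assuming the (8,)-dimensional state and (3,)-dimensional action space that the ReinforcementConfig3 trainer configures:

import numpy as np
from replay_memory import ReplayMemoryBuilder

# Buffered replay memory holding continuous 3-dimensional actions.
memory = ReplayMemoryBuilder().build_by_params(
    state_dim=(8,), method='buffer', state_dtype='float32',
    action_dim=(3,), action_dtype='float32', rewards_dtype='float32',
    memory_size=1000000, sample_size=64)

state = np.zeros((8,), dtype='float32')
action = np.array([0.1, -0.2, 0.3], dtype='float32')
memory.append(state, action, 1.0, state, False)

# Once enough transitions are stored, sampled actions have shape (64, 3).
if memory.is_sample_possible():
    states, actions, rewards, next_states, terminals = memory.sample()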
np.zeros((batch_size,)+self._state_dim, dtype=self._state_dtype) + next_states = np.zeros( + (batch_size,)+self._state_dim, dtype=self._state_dtype) terminals = np.zeros(batch_size, dtype='bool') while i < batch_size: @@ -119,25 +160,35 @@ class ReplayMemory(object): class OnlineReplayMemory(ReplayMemory): - def __init__(self, state_dim, state_dtype='float32', action_dtype='uint8', rewards_dtype='float32'): - super(OnlineReplayMemory, self).__init__(state_dim, sample_size=1, size=1, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) + def __init__( + self, state_dim, state_dtype='float32', action_dim=(1,), + action_dtype='uint8', rewards_dtype='float32' + ): + super(OnlineReplayMemory, self).__init__( + state_dim, sample_size=1, size=1, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) class CombinedReplayMemory(ReplayMemory): - def __init__(self, state_dim, sample_size, size=1000, - state_dtype='uint8', action_dtype='uint8', rewards_dtype='float32'): - super(CombinedReplayMemory, self).__init__(state_dim, sample_size=(sample_size - 1), size=size, - state_dtype=state_dtype, action_dtype=action_dtype, rewards_dtype=rewards_dtype) + def __init__( + self, state_dim, sample_size, size=1000, state_dtype='float32', + action_dim=(1,), action_dtype='uint8', rewards_dtype='float32' + ): + super(CombinedReplayMemory, self).__init__( + state_dim=state_dim, sample_size=(sample_size - 1), size=size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) self._last_state = np.zeros((1,) + state_dim, dtype=state_dtype) - self._last_action = np.array([0], dtype=action_dtype) + self._last_action = np.array((1,) + action_dim, dtype=action_dtype) self._last_reward = np.array([0], dtype=rewards_dtype) self._last_next_state = np.zeros((1,) + state_dim, dtype=state_dtype) self._last_terminal = np.array([0], dtype='bool') def append(self, state, action, reward, next_state, terminal): - super(CombinedReplayMemory, self).append(state, action, reward, next_state, terminal) + super(CombinedReplayMemory, self).append( + state, action, reward, next_state, terminal) self._last_state = state self._last_action = action self._last_reward = reward @@ -145,8 +196,10 @@ class CombinedReplayMemory(ReplayMemory): self._last_terminal = terminal def sample(self, batch_size=None): - batch_size = (batch_size-1) if batch_size is not None else self._sample_size - states, actions, rewards, next_states, terminals = super(CombinedReplayMemory, self).sample(batch_size=batch_size) + batch_size = (batch_size-1) if batch_size is not None\ + else self._sample_size + states, actions, rewards, next_states, terminals = super( + CombinedReplayMemory, self).sample(batch_size=batch_size) states = np.append(states, [self._last_state], axis=0) actions = np.append(actions, [self._last_action], axis=0) rewards = np.append(rewards, [self._last_reward], axis=0) diff --git a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/strategy.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/strategy.py new file mode 100644 index 0000000..4c47adc --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/strategy.py @@ -0,0 +1,172 @@ +import numpy as np + + +class StrategyBuilder(object): + def __init__(self): + pass + + def build_by_params( + self, + method='epsgreedy', + epsilon=0.5, + min_epsilon=0.05, + epsilon_decay_method='no', 
+ epsilon_decay=0.0, + epsilon_decay_start=0, + action_dim=None, + action_low=None, + action_high=None, + mu=0.0, + theta=0.5, + sigma=0.3 + ): + + if epsilon_decay_method == 'linear': + decay = LinearDecay( + eps_decay=epsilon_decay, min_eps=min_epsilon, + decay_start=epsilon_decay_start) + else: + decay = NoDecay() + + if method == 'epsgreedy': + assert action_dim is not None + assert len(action_dim) == 1 + return EpsilonGreedyStrategy( + eps=epsilon, number_of_actions=action_dim[0], + decay_method=decay) + elif method == 'ornstein_uhlenbeck': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert mu is not None + assert theta is not None + assert sigma is not None + return OrnsteinUhlenbeckStrategy( + action_dim, action_low, action_high, epsilon, mu, theta, + sigma, decay) + else: + assert action_dim is not None + assert len(action_dim) == 1 + return GreedyStrategy() + + +class BaseDecay(object): + def __init__(self): + pass + + def decay(self, *args): + raise NotImplementedError + + def __call__(self, *args): + return self.decay(*args) + + +class NoDecay(BaseDecay): + def __init__(self): + super(NoDecay, self).__init__() + + def decay(self, cur_eps, episode): + return cur_eps + + +class LinearDecay(BaseDecay): + def __init__(self, eps_decay, min_eps=0, decay_start=0): + super(LinearDecay, self).__init__() + self.eps_decay = eps_decay + self.min_eps = min_eps + self.decay_start = decay_start + + def decay(self, cur_eps, episode): + if episode < self.decay_start: + return cur_eps + else: + return max(cur_eps - self.eps_decay, self.min_eps) + + +class BaseStrategy(object): + def __init__(self, decay_method): + self._decay_method = decay_method + + def select_action(self, values, decay_method): + raise NotImplementedError + + def decay(self, episode): + self.cur_eps = self._decay_method.decay(self.cur_eps, episode) + + def reset(self): + pass + + +class EpsilonGreedyStrategy(BaseStrategy): + def __init__(self, eps, number_of_actions, decay_method): + super(EpsilonGreedyStrategy, self).__init__(decay_method) + self.eps = eps + self.cur_eps = eps + self.__number_of_actions = number_of_actions + + def select_action(self, values): + do_exploration = (np.random.rand() < self.cur_eps) + if do_exploration: + action = np.random.randint(low=0, high=self.__number_of_actions) + else: + action = values.asnumpy().argmax() + return action + + +class GreedyStrategy(BaseStrategy): + def __init__(self): + super(GreedyStrategy, self).__init__(None) + + def select_action(self, values): + return values.asnumpy().argmax() + + def decay(self): + pass + + +class OrnsteinUhlenbeckStrategy(BaseStrategy): + """ + Ornstein-Uhlenbeck process: dxt = theta * (mu - xt) * dt + sigma * dWt + where Wt denotes the Wiener process. 
+ """ + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + mu=0.0, + theta=.15, + sigma=.3, + decay=NoDecay() + ): + super(OrnsteinUhlenbeckStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._decay_method = decay + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._mu = np.array(mu) + self._theta = np.array(theta) + self._sigma = np.array(sigma) + + self.state = np.ones(self._action_dim) * self._mu + + def _evolve_state(self): + x = self.state + dx = self._theta * (self._mu - x)\ + + self._sigma * np.random.randn(len(x)) + self.state = x + dx + return self.state + + def reset(self): + self.state = np.ones(self._action_dim) * self._mu + + def select_action(self, values): + noise = self._evolve_state() + action = values + (self.cur_eps * noise) + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/util.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/util.py index 5857893..a1da1ce 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/util.py +++ b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/util.py @@ -5,9 +5,9 @@ import matplotlib.pyplot as plt from matplotlib import style import time import os -import mxnet +import mxnet as mx from mxnet import gluon, nd - +import cnnarch_logger LOSS_FUNCTIONS = { 'l1': gluon.loss.L1Loss(), @@ -16,17 +16,46 @@ LOSS_FUNCTIONS = { 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} -def copy_net(net, input_state_dim, ctx, tmp_filename='tmp.params'): + +def make_directory_if_not_exist(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + +def copy_net(net, input_state_dim, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2(mx.nd.ones((1,) + input_state_dim, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + + return net2 + + +def copy_net_with_two_inputs(net, input_state_dim1, input_state_dim2, ctx): assert isinstance(net, gluon.HybridBlock) assert type(net.__class__) is type - net.save_parameters(tmp_filename) + net2 = net.__class__() - net2.load_parameters(tmp_filename, ctx=ctx) - os.remove(tmp_filename) + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) net2.hybridize() - net2(nd.ones((1,) + input_state_dim, ctx=ctx)) + net2( + nd.ones((1,) + input_state_dim1, ctx=ctx), + nd.ones((1,) + input_state_dim2, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + return net2 + def get_loss_function(loss_function_name): if loss_function_name not in LOSS_FUNCTIONS: raise ValueError('Loss function does not exist') @@ -52,89 +81,196 @@ class AgentSignalHandler(object): sys.exit(1) style.use('fivethirtyeight') + + class TrainingStats(object): - def __init__(self, logger, max_episodes, live_plot=True): - self.__logger = logger - self.__max_episodes = max_episodes - self.__all_avg_loss = np.zeros((max_episodes,)) - self.__all_total_rewards = np.zeros((max_episodes,)) - 
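The strategy module above generalizes the old epsilon-greedy action policy: for DDPG, the OrnsteinUhlenbeckStrategy adds temporally correlated exploration noise that is scaled by the current epsilon, clipped to the action bounds, decayed per episode, and reset when an episode terminates. A usage sketch with assumed action bounds of [-1, 1] and a toy actor output:

import numpy as np
from strategy import StrategyBuilder

strategy = StrategyBuilder().build_by_params(
    method='ornstein_uhlenbeck', epsilon=1.0, min_epsilon=0.05,
    epsilon_decay_method='linear', epsilon_decay=0.01,
    action_dim=(3,), action_low=-1.0, action_high=1.0,
    mu=0.0, theta=0.15, sigma=0.3)

greedy_action = np.array([0.2, -0.5, 0.9])             # e.g. the actor's output
noisy_action = strategy.select_action(greedy_action)   # noise added, clipped to [-1, 1]

strategy.decay(0)   # linear decay of cur_eps after episode 0
strategy.reset()    # reset the OU state when an episode terminates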
self.__all_eps = np.zeros((max_episodes,)) - self.__all_time = np.zeros((max_episodes,)) - self.__all_mean_reward_last_100_episodes = np.zeros((max_episodes,)) - self.__live_plot = live_plot + def __init__(self, max_episodes): + self._logger = cnnarch_logger.ArchLogger.get_logger() + self._max_episodes = max_episodes + self._all_total_rewards = np.zeros((max_episodes,)) + self._all_eps = np.zeros((max_episodes,)) + self._all_time = np.zeros((max_episodes,)) + self._all_mean_reward_last_100_episodes = np.zeros((max_episodes,)) @property def logger(self): - return self.__logger + return self._logger @logger.setter def logger(self, logger): - self.__logger = logger + self._logger = logger @logger.deleter def logger(self): - self.__logger = None - - def add_avg_loss(self, episode, avg_loss): - self.__all_avg_loss[episode] = avg_loss + self._logger = None def add_total_reward(self, episode, total_reward): - self.__all_total_rewards[episode] = total_reward + self._all_total_rewards[episode] = total_reward def add_eps(self, episode, eps): - self.__all_eps[episode] = eps + self._all_eps[episode] = eps def add_time(self, episode, time): - self.__all_time[episode] = time + self._all_time[episode] = time def add_mean_reward_last_100(self, episode, mean_reward): - self.__all_mean_reward_last_100_episodes[episode] = mean_reward + self._all_mean_reward_last_100_episodes[episode] = mean_reward + + def log_episode(self, *args): + raise NotImplementedError + + def mean_of_reward(self, cur_episode, last=100): + if cur_episode > 0: + reward_last_100 =\ + self._all_total_rewards[max(0, cur_episode-last):cur_episode] + return np.mean(reward_last_100) + else: + return self._all_total_rewards[0] - def log_episode(self, episode, start_time, training_steps, loss, eps, reward): + def save(self, path): + np.save(os.path.join(path, 'total_rewards'), self._all_total_rewards) + np.save(os.path.join(path, 'eps'), self._all_eps) + np.save(os.path.join(path, 'time'), self._all_time) + np.save( + os.path.join(path, 'mean_reward'), + self._all_mean_reward_last_100_episodes) + + def _log_episode(self, episode, start_time, training_steps, eps, reward): self.add_eps(episode, eps) self.add_total_reward(episode, reward) end = time.time() - if training_steps == 0: - avg_loss = 0 - else: - avg_loss = float(loss)/float(training_steps) - mean_reward_last_100 = self.mean_of_reward(episode, last=100) - time_elapsed = end - start_time - info = "Episode: %d, Total Reward: %.3f, Avg. Reward Last 100 Episodes: %.3f, Avg Loss: %.3f, Time: %.3f, Training Steps: %d, Eps: %.3f"\ - % (episode, reward, mean_reward_last_100, avg_loss, time_elapsed, training_steps, eps) - self.__logger.info(info) - self.add_avg_loss(episode, avg_loss) self.add_time(episode, time_elapsed) self.add_mean_reward_last_100(episode, mean_reward_last_100) + return ('Episode: %d, Total Reward: %.3f, ' + 'Avg. 
Reward Last 100 Episodes: %.3f, {}, ' + 'Time: %.3f, Training Steps: %d, Eps: %.3f') % ( + episode, reward, mean_reward_last_100, time_elapsed, + training_steps, eps), mean_reward_last_100 - return avg_loss, time_elapsed, mean_reward_last_100 - def mean_of_reward(self, cur_episode, last=100): - if cur_episode > 0: - reward_last_100 = self.__all_total_rewards[max(0, cur_episode-last):cur_episode] - return np.mean(reward_last_100) - else: - return self.__all_total_rewards[0] +class DqnTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DqnTrainingStats, self).__init__(max_episodes) + self._all_avg_loss = np.zeros((max_episodes,)) + + def add_avg_loss(self, episode, avg_loss): + self._all_avg_loss[episode] = avg_loss + + def log_episode( + self, episode, start_time, training_steps, avg_loss, eps, reward + ): + self.add_avg_loss(episode, avg_loss) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(('Avg. Loss: %.3f') % (avg_loss)) - def save_stats(self, file): - fig = plt.figure(figsize=(20,20)) + self._logger.info(info) + return avg_reward + + def save_stats(self, path): + fig = plt.figure(figsize=(20, 20)) sub_rewards = fig.add_subplot(221) sub_rewards.set_title('Total Rewards per episode') - sub_rewards.plot(np.arange(self.__max_episodes), self.__all_total_rewards) + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) sub_loss = fig.add_subplot(222) sub_loss.set_title('Avg. Loss per episode') - sub_loss.plot(np.arange(self.__max_episodes), self.__all_avg_loss) + sub_loss.plot(np.arange(self._max_episodes), self._all_avg_loss) sub_eps = fig.add_subplot(223) sub_eps.set_title('Epsilon per episode') - sub_eps.plot(np.arange(self.__max_episodes), self.__all_eps) + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) sub_rewards = fig.add_subplot(224) sub_rewards.set_title('Avg. mean reward of last 100 episodes') - sub_rewards.plot(np.arange(self.__max_episodes), self.__all_mean_reward_last_100_episodes) + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) + + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) + + def save(self, path): + super(DqnTrainingStats, self).save(path) + np.save(os.path.join(path, 'avg_loss'), self._all_avg_loss) + + +class DdpgTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DdpgTrainingStats, self).__init__(max_episodes) + self._all_avg_critic_loss = np.zeros((max_episodes,)) + self._all_avg_actor_loss = np.zeros((max_episodes,)) + self._all_avg_qvalues = np.zeros((max_episodes,)) + + def add_avg_critic_loss(self, episode, avg_critic_loss): + self._all_avg_critic_loss[episode] = avg_critic_loss + + def add_avg_actor_loss(self, episode, avg_actor_loss): + self._all_avg_actor_loss[episode] = avg_actor_loss + + def add_avg_qvalues(self, episode, avg_qvalues): + self._all_avg_qvalues[episode] = avg_qvalues + + def log_episode( + self, episode, start_time, training_steps, actor_loss, + critic_loss, qvalues, eps, reward + ): + self.add_avg_actor_loss(episode, actor_loss) + self.add_avg_critic_loss(episode, critic_loss) + self.add_avg_qvalues(episode, qvalues) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(( + 'Avg. Actor Loss: %.3f ' + 'Avg. Critic Loss: %.3f ' + 'Avg. 
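Both agents now report per-episode aggregates through dedicated stats classes: DqnTrainingStats takes a single average loss, while DdpgTrainingStats takes the average actor loss, critic loss, and Q-values. A sketch of the two log_episode calls with assumed toy values:

import time
from util import DqnTrainingStats, DdpgTrainingStats

dqn_stats = DqnTrainingStats(max_episodes=100)
dqn_stats.log_episode(0, time.time(), training_steps=10,
                      avg_loss=0.05, eps=0.9, reward=12.0)

ddpg_stats = DdpgTrainingStats(max_episodes=100)
ddpg_stats.log_episode(0, time.time(), training_steps=10,
                       actor_loss=-1.2, critic_loss=0.03,
                       qvalues=1.4, eps=0.9, reward=12.0)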
Q-Values: %.3f') % (actor_loss, critic_loss, qvalues)) + + self.logger.info(info) + return avg_reward + + def save(self, path): + super(DdpgTrainingStats, self).save(path) + np.save(os.path.join( + path, 'avg_critic_loss'), self._all_avg_critic_loss) + np.save(os.path.join(path, 'avg_actor_loss'), self._all_avg_actor_loss) + np.save(os.path.join(path, 'avg_qvalues'), self._all_avg_qvalues) + + def save_stats(self, path): + fig = plt.figure(figsize=(120, 120)) + + sub_rewards = fig.add_subplot(321) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) + + sub_actor_loss = fig.add_subplot(322) + sub_actor_loss.set_title('Avg. Actor Loss per episode') + sub_actor_loss.plot( + np.arange(self._max_episodes), self._all_avg_actor_loss) + + sub_critic_loss = fig.add_subplot(323) + sub_critic_loss.set_title('Avg. Critic Loss per episode') + sub_critic_loss.plot( + np.arange(self._max_episodes), self._all_avg_critic_loss) + + sub_qvalues = fig.add_subplot(324) + sub_qvalues.set_title('Avg. QValues per episode') + sub_qvalues.plot( + np.arange(self._max_episodes), self._all_avg_qvalues) + + sub_eps = fig.add_subplot(325) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) + + sub_rewards = fig.add_subplot(326) + sub_rewards.set_title('Avg. mean reward of last 100 episodes') + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) - plt.savefig(file) \ No newline at end of file + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) diff --git a/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py b/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py new file mode 100644 index 0000000..3b401ba --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py @@ -0,0 +1,117 @@ +from reinforcement_learning.agent import DqnAgent +from reinforcement_learning.util import AgentSignalHandler +from reinforcement_learning.cnnarch_logger import ArchLogger +import reinforcement_learning.environment +import CNNCreator_reinforcementConfig3 + +import os +import sys +import re +import time +import numpy as np +import mxnet as mx + + +def resume_session(sessions_dir): + resume_session = False + resume_directory = None + if os.path.isdir(sessions_dir): + regex = re.compile(r'\d\d\d\d-\d\d-\d\d-\d\d-\d\d') + dir_content = os.listdir(sessions_dir) + session_files = filter(regex.search, dir_content) + session_files.sort(reverse=True) + for d in session_files: + interrupted_session_dir = os.path.join(sessions_dir, d, '.interrupted_session') + if os.path.isdir(interrupted_session_dir): + resume = raw_input('Interrupted session from {} found. Do you want to resume? 
(y/n) '.format(d)) + if resume == 'y': + resume_session = True + resume_directory = interrupted_session_dir + break + return resume_session, resume_directory + + +if __name__ == "__main__": + agent_name = 'reinforcement_agent' + # Prepare output directory and logger + all_output_dir = os.path.join('model', agent_name) + output_directory = os.path.join( + all_output_dir, + time.strftime('%Y-%m-%d-%H-%M-%S', + time.localtime(time.time()))) + ArchLogger.set_output_directory(output_directory) + ArchLogger.set_logger_name(agent_name) + ArchLogger.set_output_level(ArchLogger.INFO) + + env_params = { + 'ros_node_name': 'reinforcementConfig3TrainerNode', + 'state_topic': '/environment/state', + 'action_topic': '/environment/action', + 'reset_topic': '/environment/reset', + 'reward_topic': '/environment/reward', + } + env = reinforcement_learning.environment.RosEnvironment(**env_params) + + context = mx.cpu() + qnet_creator = CNNCreator_reinforcementConfig3.CNNCreator_reinforcementConfig3() + qnet_creator.construct(context) + + agent_params = { + 'environment': env, + 'replay_memory_params': { + 'method': 'buffer', + 'memory_size': 1000000, + 'sample_size': 64, + 'state_dtype': 'float32', + 'action_dtype': 'float32', + 'rewards_dtype': 'float32' + }, + 'strategy_params': { + 'method':'epsgreedy', + 'epsilon': 1, + 'min_epsilon': 0.02, + 'epsilon_decay_method': 'linear', + 'epsilon_decay': 0.0001, + }, + 'agent_name': agent_name, + 'verbose': True, + 'output_directory': output_directory, + 'state_dim': (8,), + 'action_dim': (3,), + 'discount_factor': 0.99999, + 'training_episodes': 1000, + 'train_interval': 1, + 'snapshot_interval': 500, + 'max_episode_step': 10000, + 'target_score': 35000, + 'qnet':qnet_creator.net, + 'use_fix_target': True, + 'target_update_interval': 500, + 'loss_function': 'huber_loss', + 'optimizer': 'adam', + 'optimizer_params': { + 'learning_rate': 0.001 }, + 'double_dqn': True, + } + + resume, resume_directory = resume_session(all_output_dir) + + if resume: + output_directory, _ = os.path.split(resume_directory) + ArchLogger.set_output_directory(output_directory) + resume_agent_params = { + 'session_dir': resume_directory, + 'environment': env, + 'net': qnet_creator.net, + } + agent = DqnAgent.resume_from_session(**resume_agent_params) + else: + agent = DqnAgent(**agent_params) + + signal_handler = AgentSignalHandler() + signal_handler.register_agent(agent) + + train_successful = agent.train() + + if train_successful: + agent.save_best_network(qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_newest', epoch=0) diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/__init__.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py new file mode 100644 index 0000000..c06b32a --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py @@ -0,0 +1,900 @@ +import mxnet as mx +import numpy as np +import time +import os +import sys +import util +import matplotlib.pyplot as plt +import pyprind +from cnnarch_logger import ArchLogger +from replay_memory import ReplayMemoryBuilder +from strategy import StrategyBuilder +from util import copy_net, get_loss_function,\ + copy_net_with_two_inputs, DdpgTrainingStats, DqnTrainingStats,\ + 
make_directory_if_not_exist +from mxnet import nd, gluon, autograd + + +class Agent(object): + def __init__( + self, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + training_episodes=50, + train_interval=1, + start_training=0, + snapshot_interval=200, + agent_name='Agent', + max_episode_step=99999, + evaluation_samples=1000, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + assert 0 < discount_factor <= 1,\ + 'Discount factor must be between 0 and 1' + assert train_interval > 0, 'Train interval must be greater 0' + assert snapshot_interval > 0, 'Snapshot interval must be greater 0' + assert max_episode_step > 0,\ + 'Maximal steps per episode must be greater 0' + assert training_episodes > 0, 'Trainings episode must be greater 0' + assert replay_memory_params is not None,\ + 'Replay memory parameter not set' + assert type(state_dim) is tuple, 'State dimension is not a tuple' + assert type(action_dim) is tuple, 'Action dimension is not a tuple' + + self._logger = ArchLogger.get_logger() + self._ctx = mx.gpu() if ctx == 'gpu' else mx.cpu() + self._environment = environment + self._discount_factor = discount_factor + self._training_episodes = training_episodes + self._train_interval = train_interval + self._verbose = verbose + self._state_dim = state_dim + + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = action_dim + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + self._action_dim = action_dim + + strategy_params['action_dim'] = self._action_dim + self._strategy_params = strategy_params + strategy_builder = StrategyBuilder() + self._strategy = strategy_builder.build_by_params(**strategy_params) + self._agent_name = agent_name + self._snapshot_interval = snapshot_interval + self._creation_time = time.time() + self._max_episode_step = max_episode_step + self._start_training = start_training + self._output_directory = output_directory + self._target_score = target_score + + self._evaluation_samples = evaluation_samples + self._best_avg_score = -np.infty + self._best_net = None + + self._interrupt_flag = False + self._training_stats = None + + # Training Context + self._current_episode = 0 + self._total_steps = 0 + + @property + def current_episode(self): + return self._current_episode + + @property + def environment(self): + return self._environment + + def save_config_file(self): + import json + make_directory_if_not_exist(self._output_directory) + filename = os.path.join(self._output_directory, 'config.json') + config = self._make_config_dict() + with open(filename, mode='w') as fp: + json.dump(config, fp, indent=4) + + def set_interrupt_flag(self, interrupt): + self._interrupt_flag = interrupt + + def _interrupt_training(self): + import pickle + self._logger.info('Training interrupted; Store state for resuming') + session_dir = self._get_session_dir() + agent_session_file = os.path.join(session_dir, 'agent.p') + logger = self._logger + + self._make_pickle_ready(session_dir) + + with open(agent_session_file, 'wb') as f: + pickle.dump(self, f, protocol=2) + logger.info('State successfully stored') + + def _make_pickle_ready(self, session_dir): + del self._training_stats.logger + self._logger = None + self._environment.close() + self._environment = None + self._save_net(self._best_net, 'best_net', 
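The base Agent also carries the interrupt/resume machinery used by the generated trainers: a signal handler flags the agent, the agent pickles itself into an .interrupted_session directory at the next episode boundary, and a later run restores it through resume_from_session. A sketch of the round trip; agent, env and qnet_creator stand in for the objects the trainer script builds, and the timestamp in the path is a placeholder:

from reinforcement_learning.agent import DqnAgent
from reinforcement_learning.util import AgentSignalHandler

signal_handler = AgentSignalHandler()
signal_handler.register_agent(agent)   # SIGINT is expected to set the agent's interrupt flag
agent.train()                          # on interrupt, state goes to <output_dir>/.interrupted_session

# In a later run, restore the pickled agent together with fresh net/environment objects.
agent = DqnAgent.resume_from_session(
    session_dir='model/reinforcement_agent/<timestamp>/.interrupted_session',
    net=qnet_creator.net,
    environment=env)
agent.train()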
session_dir) + self._best_net = None + + def _make_config_dict(self): + config = dict() + config['state_dim'] = self._state_dim + config['action_dim'] = self._action_dim + config['ctx'] = str(self._ctx) + config['discount_factor'] = self._discount_factor + config['strategy_params'] = self._strategy_params + config['replay_memory_params'] = self._replay_memory_params + config['training_episodes'] = self._training_episodes + config['start_training'] = self._start_training + config['evaluation_samples'] = self._evaluation_samples + config['train_interval'] = self._train_interval + config['snapshot_interval'] = self._snapshot_interval + config['agent_name'] = self._agent_name + config['max_episode_step'] = self._max_episode_step + config['output_directory'] = self._output_directory + config['verbose'] = self._verbose + config['target_score'] = self._target_score + return config + + def _adjust_optimizer_params(self, optimizer_params): + if 'weight_decay' in optimizer_params: + optimizer_params['wd'] = optimizer_params['weight_decay'] + del optimizer_params['weight_decay'] + if 'learning_rate_decay' in optimizer_params: + min_learning_rate = 1e-8 + if 'learning_rate_minimum' in optimizer_params: + min_learning_rate = optimizer_params['learning_rate_minimum'] + del optimizer_params['learning_rate_minimum'] + optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( + optimizer_params['step_size'], + factor=optimizer_params['learning_rate_decay'], + stop_factor_lr=min_learning_rate) + del optimizer_params['step_size'] + del optimizer_params['learning_rate_decay'] + + return optimizer_params + + def _sample_from_memory(self): + states, actions, rewards, next_states, terminals\ + = self._memory.sample(batch_size=self._minibatch_size) + states = nd.array(states, ctx=self._ctx) + actions = nd.array(actions, ctx=self._ctx) + rewards = nd.array(rewards, ctx=self._ctx) + next_states = nd.array(next_states, ctx=self._ctx) + terminals = nd.array(terminals, ctx=self._ctx) + return states, actions, rewards, next_states, terminals + + def evaluate(self, target=None, sample_games=100, verbose=True): + target = self._target_score if target is None else target + if target: + target_achieved = 0 + total_reward = 0 + + self._logger.info('Sampling from {} games...'.format(sample_games)) + for g in pyprind.prog_bar(range(sample_games)): + state = self._environment.reset() + step = 0 + game_reward = 0 + terminal = False + while not terminal and (step < self._max_episode_step): + action = self.get_next_action(state) + state, reward, terminal, _ = self._environment.step(action) + game_reward += reward + step += 1 + + if verbose: + info = 'Game %d: Reward %f' % (g, game_reward) + self._logger.debug(info) + if target: + if game_reward >= target: + target_achieved += 1 + total_reward += game_reward + + avg_reward = float(total_reward)/float(sample_games) + info = 'Avg. 
Reward: %f' % avg_reward + if target: + target_achieved_ratio = int( + (float(target_achieved)/float(sample_games))*100) + info += '; Target Achieved in %d%% of games'\ + % (target_achieved_ratio) + + if verbose: + self._logger.info(info) + return avg_reward + + def _do_snapshot_if_in_interval(self, episode): + do_snapshot =\ + (episode != 0 and (episode % self._snapshot_interval == 0)) + if do_snapshot: + self.save_parameters(episode=episode) + self._evaluate() + + def _evaluate(self, verbose=True): + avg_reward = self.evaluate( + sample_games=self._evaluation_samples, verbose=False) + info = 'Evaluation -> Average Reward in {} games: {}'.format( + self._evaluation_samples, avg_reward) + + if self._best_avg_score is None or self._best_avg_score <= avg_reward: + self._save_current_as_best_net() + self._best_avg_score = avg_reward + if verbose: + self._logger.info(info) + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _do_training(self): + return (self._total_steps % self._train_interval == 0) and\ + (self._memory.is_sample_possible(self._minibatch_size)) and\ + (self._current_episode >= self._start_training) + + def _check_interrupt_routine(self): + if self._interrupt_flag: + self._interrupt_flag = False + self._interrupt_training() + return True + return False + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + assert self._output_directory + assert isinstance(net, gluon.HybridBlock) + make_directory_if_not_exist(self._output_directory) + + if(episode is not None): + self._logger.info( + 'Saving model parameters after episode %d' % episode) + filename = filename + '-ep{}'.format(episode) + else: + self._logger.info('Saving model parameters') + self._save_net(net, filename) + + def _save_net(self, net, filename, filedir=None): + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename + '.params') + net.save_parameters(filename) + + def save_best_network(self, path, epoch=0): + self._logger.info( + 'Saving best network with average reward of {}'.format( + self._best_avg_score)) + self._best_net.export(path, epoch=epoch) + + def _get_session_dir(self): + session_dir = os.path.join( + self._output_directory, '.interrupted_session') + make_directory_if_not_exist(session_dir) + return session_dir + + def _save_current_as_best_net(self): + raise NotImplementedError + + def get_next_action(self, state): + raise NotImplementedError + + def save_parameters(self, episode): + raise NotImplementedError + + def train(self, episodes=None): + raise NotImplementedError + + +class DdpgAgent(Agent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + 
strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + assert critic is not None, 'Critic not set' + assert actor is not None, 'Actor is not set' + assert soft_target_update_rate > 0,\ + 'Target update must be greater zero' + assert actor_optimizer is not None, 'No actor optimizer set' + assert critic_optimizer is not None, 'No critic optimizer set' + + self._actor = actor + self._critic = critic + + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + self._actor_optimizer = actor_optimizer + self._actor_optimizer_params = self._adjust_optimizer_params( + actor_optimizer_params) + + self._critic_optimizer = critic_optimizer + self._critic_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + self._soft_target_update_rate = soft_target_update_rate + + self._logger.info( + 'Agent created with following parameters: {}'.format( + self._make_config_dict())) + + self._best_net = self._copy_actor() + + self._training_stats = DdpgTrainingStats(self._training_episodes) + + def _make_pickle_ready(self, session_dir): + super(DdpgAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._actor, 'actor', session_dir) + self._actor = None + self._save_net(self._critic, 'critic', session_dir) + self._critic = None + self._save_net(self._actor_target, 'actor_target', session_dir) + self._actor_target = None + self._save_net(self._critic_target, 'critic_target', session_dir) + self._critic_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + 
nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _save_current_as_best_net(self): + self._best_net = self._copy_actor() + + def get_next_action(self, state): + action = self._actor(nd.array([state], ctx=self._ctx)) + return action[0].asnumpy() + + def save_parameters(self, episode): + self._save_parameters(self._actor, episode=episode) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start DDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + actor_target_actions = self._actor_target(next_states) + critic_target_qvalues = self._critic_target( + next_states, actor_target_actions) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) + y = rewards + (1.0 - terminals) * self._discount_factor\ + * critic_target_qvalues + + # Train the critic network + with autograd.record(): + qvalues = self._critic(states, actions) + critic_loss = l2_loss(qvalues, y) + critic_loss.backward() + trainer_critic.step(self._minibatch_size) + + # Train the actor network + # Temporary critic so that gluon trainer does not mess + # with 
critic parameters + tmp_critic = self._copy_critic() + with autograd.record(): + actor_qvalues = tmp_critic(states, self._actor(states)) + # For maximizing qvalues we have to multiply with -1 + # as we use a minimizer + actor_loss = -1 * actor_qvalues + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss +=\ + np.sum(actor_loss.asnumpy()) / self._minibatch_size + episode_avg_q_value +=\ + np.sum(actor_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if training_steps == 0\ + else (episode_actor_loss / training_steps) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if training_steps == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + self._strategy.decay(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DdpgAgent, self)._make_config_dict() + config['soft_target_update_rate'] = self._soft_target_update_rate + config['actor_optimizer'] = self._actor_optimizer + config['actor_optimizer_params'] = self._actor_optimizer_params + config['critic_optimizer'] = self._critic_optimizer + config['critic_optimizer_params'] = self._critic_optimizer_params + return config + + def _soft_update(self, net, target, tau): + net_params = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(target.collect_params().items()): + target_params = p.data() + p.set_data((1.0 - tau) * target_params + tau * net_params[i]) + return target + + def _copy_actor(self): + assert self._actor is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + return copy_net(self._actor, self._state_dim, ctx=self._ctx) + + def _copy_critic(self): + assert self._critic is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic, self._state_dim, self._action_dim, ctx=self._ctx) + + +class DqnAgent(Agent): + def __init__( + self, + qnet, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + loss_function='euclidean', + optimizer='rmsprop', + optimizer_params={'learning_rate': 0.09}, + 
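Putting the pieces together, the DdpgAgent above is constructed from an actor network mu(s) and a critic network Q(s, a) that takes state and action as two inputs. The sketch below is a simplified, assumed wiring with toy gluon networks and a placeholder environment; the generated DDPG trainers follow the same pattern with the CNNCreator-built actor/critic and a Gym or ROS environment.

import mxnet as mx
from mxnet import gluon

from reinforcement_learning.agent import DdpgAgent


class ToyActor(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(ToyActor, self).__init__(**kwargs)
        with self.name_scope():
            self.out = gluon.nn.Dense(3, activation='tanh')

    def hybrid_forward(self, F, state):
        return self.out(state)


class ToyCritic(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(ToyCritic, self).__init__(**kwargs)
        with self.name_scope():
            self.q = gluon.nn.Dense(1)

    def hybrid_forward(self, F, state, action):
        return self.q(F.concat(state, action, dim=1))


ctx = mx.cpu()
actor, critic = ToyActor(), ToyCritic()
actor.initialize(ctx=ctx)
critic.initialize(ctx=ctx)
# One forward pass materializes the deferred parameter shapes before copying.
actor(mx.nd.ones((1, 8), ctx=ctx))
critic(mx.nd.ones((1, 8), ctx=ctx), mx.nd.ones((1, 3), ctx=ctx))

env = None  # stands in for a GymEnvironment/RosEnvironment instance

agent = DdpgAgent(
    actor=actor, critic=critic, environment=env,
    replay_memory_params={'method': 'buffer', 'memory_size': 100000,
                          'sample_size': 64, 'action_dtype': 'float32'},
    strategy_params={'method': 'ornstein_uhlenbeck', 'epsilon': 1.0,
                     'action_low': -1.0, 'action_high': 1.0},
    state_dim=(8,), action_dim=(3,),
    soft_target_update_rate=0.001)
# agent.train() would run the DDPG loop above once env provides reset()/step().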
training_episodes=50, + start_training=0, + train_interval=1, + use_fix_target=False, + double_dqn=False, + target_update_interval=10, + snapshot_interval=200, + evaluation_samples=100, + agent_name='Dqn_agent', + max_episode_step=99999, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DqnAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + + self._qnet = qnet + self._target_update_interval = target_update_interval + self._target_qnet = copy_net( + self._qnet, self._state_dim, ctx=self._ctx) + self._loss_function_str = loss_function + self._loss_function = get_loss_function(loss_function) + self._optimizer = optimizer + self._optimizer_params = optimizer_params + self._double_dqn = double_dqn + self._use_fix_target = use_fix_target + + # Initialize best network + self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) + self._best_avg_score = -np.infty + + self._training_stats = None + + @classmethod + def resume_from_session(cls, session_dir, net, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['q_net_params'] = os.path.join(session_dir, 'qnet.params') + files['target_net_params'] = os.path.join( + session_dir, 'target_net.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + agent._qnet = net + agent._qnet.load_parameters(files['q_net_params'], agent._ctx) + agent._qnet.hybridize() + agent._qnet(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + agent._best_net = copy_net(agent._qnet, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + agent._target_qnet = copy_net( + agent._qnet, agent._state_dim, agent._ctx) + agent._target_qnet.load_parameters( + files['target_net_params'], agent._ctx) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = agent._logger + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _make_pickle_ready(self, session_dir): + super(DqnAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._qnet, 'qnet', session_dir) + self._qnet = None + self._save_net(self._target_qnet, 'target_net', session_dir) + self._target_qnet = None + + def get_q_values(self, state, with_best=False): + return self.get_batch_q_values( + nd.array([state], ctx=self._ctx), with_best=with_best)[0] + + def get_batch_q_values(self, state_batch, with_best=False): + return self._best_net(state_batch)\ + if with_best else self._qnet(state_batch) + + def get_next_action(self, state, with_best=False): + q_values = self.get_q_values(state, with_best=with_best) + action = 
q_values.asnumpy().argmax() + return action + + def __determine_target_q_values( + self, states, actions, rewards, next_states, terminals + ): + if self._use_fix_target: + q_max_val = self._target_qnet(next_states) + else: + q_max_val = self._qnet(next_states) + + if self._double_dqn: + q_values_next_states = self._qnet(next_states) + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_values_next_states))\ + * (1.0 - terminals) * self._discount_factor + else: + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_max_val))\ + * (1.0 - terminals) * self._discount_factor + + target_qval = self._qnet(states) + for t in range(target_rewards.shape[0]): + target_qval[t][actions[t]] = target_rewards[t] + + return target_qval + + def __train_q_net_step(self, trainer): + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + target_qval = self.__determine_target_q_values( + states, actions, rewards, next_states, terminals) + with autograd.record(): + q_values = self._qnet(states) + loss = self._loss_function(q_values, target_qval) + loss.backward() + trainer.step(self._minibatch_size) + return loss + + def __do_target_update_if_in_interval(self, total_steps): + do_target_update = ( + self._use_fix_target and + (total_steps % self._target_update_interval == 0)) + if do_target_update: + self._logger.info( + 'Target network is updated after {} steps'.format(total_steps)) + self._target_qnet = copy_net( + self._qnet, self._state_dim, self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start training ---") + trainer = gluon.Trainer( + self._qnet.collect_params(), + self._optimizer, + self._adjust_optimizer_params(self._optimizer_params)) + episodes = episodes if episodes is not None\ + else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info("Starting from episode {}".format( + self._current_episode)) + else: + self._training_stats = DqnTrainingStats(episodes) + + # Implementation Deep Q Learning described by + # Mnih et. al. in Playing Atari with Deep Reinforcement Learning + while self._current_episode < episodes: + if self._check_interrupt_routine(): + return False + + step = 0 + episode_reward = 0 + start = time.time() + state = self._environment.reset() + episode_loss = 0 + training_steps = 0 + while step < self._max_episode_step: + # 1. Choose an action based on current game state and policy + q_values = self._qnet(nd.array([state], ctx=self._ctx)) + action = self._strategy.select_action(q_values[0]) + + # 2. Play the game for a single step + next_state, reward, terminal, _ =\ + self._environment.step(action) + + # 3. Store transition in replay memory + self._memory.append( + state, action, reward, next_state, terminal) + + # 4. 
Train the network if in interval + if self._do_training(): + loss = self.__train_q_net_step(trainer) + training_steps += 1 + episode_loss +=\ + np.sum(loss.asnumpy()) / self._minibatch_size + + # Update target network if in interval + self.__do_target_update_if_in_interval(self._total_steps) + + step += 1 + self._total_steps += 1 + episode_reward += reward + state = next_state + + if terminal: + self._strategy.reset() + break + + self._do_snapshot_if_in_interval(self._current_episode) + + episode_loss = (episode_loss / training_steps)\ + if training_steps > 0 else 0 + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_loss, self._strategy.cur_eps, episode_reward) + + self._strategy.decay(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DqnAgent, self)._make_config_dict() + config['optimizer'] = self._optimizer + config['optimizer_params'] = self._optimizer_params + config['loss_function'] = self._loss_function_str + config['use_fix_target'] = self._use_fix_target + config['double_dqn'] = self._double_dqn + config['target_update_interval'] = self._target_update_interval + return config + + def save_parameters(self, episode): + self._save_parameters(self._qnet, episode=episode) + + def _save_current_as_best_net(self): + self._best_net = copy_net( + self._qnet, (1,) + self._state_dim, ctx=self._ctx) diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/cnnarch_logger.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/cnnarch_logger.py new file mode 100644 index 0000000..19f36b1 --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/cnnarch_logger.py @@ -0,0 +1,95 @@ +import logging +import sys +import os +import util + + +class ArchLogger(object): + _logger = None + + __output_level = logging.INFO + __logger_name = 'agent' + __output_directory = '.' 
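+ # Default logging configuration; these values can be overridden through the
+ # static set_* methods below before init_logger() is called.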
+ __append = True + __logformat = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + __dateformat = '%d-%b-%y %H:%M:%S' + + INFO = logging.INFO + DEBUG = logging.DEBUG + + @staticmethod + def set_output_level(output_level): + assert output_level is not None + ArchLogger.__output_level = output_level + + @staticmethod + def set_logger_name(logger_name): + assert logger_name is not None + ArchLogger.__logger_name = logger_name + + @staticmethod + def set_output_directory(output_directory): + assert output_directory is not None + ArchLogger.__output_directory = output_directory + + @staticmethod + def set_append(append): + assert append is not None + ArchLogger.__append = append + + @staticmethod + def set_log_format(logformat, dateformat): + assert logformat is not None + assert dateformat is not None + ArchLogger.__logformat = logformat + ArchLogger.__dateformat = dateformat + + @staticmethod + def init_logger(make_log_file=True): + assert ArchLogger._logger is None, 'Logger init already called' + filemode = 'a' if ArchLogger.__append else 'w' + formatter = logging.Formatter( + fmt=ArchLogger.__logformat, datefmt=ArchLogger.__dateformat) + + logger = logging.getLogger(ArchLogger.__logger_name) + logger.propagate = False + + if not logger.handlers: + logger.setLevel(ArchLogger.__output_level) + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setLevel(ArchLogger.__output_level) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + if make_log_file: + util.make_directory_if_not_exist(ArchLogger.__output_directory) + log_file = os.path.join( + ArchLogger.__output_directory, + ArchLogger.__logger_name + '.log') + file_handler = logging.FileHandler(log_file, mode=filemode) + file_handler.setLevel(ArchLogger.__output_level) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + ArchLogger._logger = logger + + @staticmethod + def get_logger(): + if ArchLogger._logger is None: + ArchLogger.init_logger() + assert ArchLogger._logger is not None + return ArchLogger._logger + +if __name__ == "__main__": + print('=== Test logger ===') + ArchLogger.set_logger_name('TestLogger') + ArchLogger.set_output_directory('test_log') + ArchLogger.init_logger() + logger = ArchLogger.get_logger() + logger.warning('This is a warning') + logger.debug('This is a debug information, which you should not see') + logger.info('This is a normal information') + assert os.path.exists('test_log')\ + and os.path.isfile(os.path.join('test_log', 'TestLogger.log')),\ + 'Test failed: No logfile exists' + import shutil + shutil.rmtree('test_log') \ No newline at end of file diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/environment.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/environment.py new file mode 100644 index 0000000..5f8b057 --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/environment.py @@ -0,0 +1,134 @@ +import abc +import logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class Environment: + __metaclass__ = abc.ABCMeta + + def __init__(self): + pass + + @abc.abstractmethod + def reset(self): + pass + + @abc.abstractmethod + def step(self, action): + pass + + @abc.abstractmethod + def close(self): + pass + +import rospy +import thread +import numpy as np +import time +from std_msgs.msg import Float32MultiArray, Bool, Int32, MultiArrayDimension, Float32 + +class RosEnvironment(Environment): + 
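+ # ROS-backed Environment implementation: actions and reset requests are
+ # published on ROS topics, while state, reward and terminal flags arrive
+ # asynchronously through subscriber callbacks and are waited for (with a
+ # timeout and up to three retries) in __wait_for_new_state.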
def __init__(self, + ros_node_name='RosTrainingAgent', + timeout_in_s=3, + state_topic='state', + action_topic='action', + reset_topic='reset', + terminal_state_topic='terminal', + reward_topic='reward'): + super(RosEnvironment, self).__init__() + self.__timeout_in_s = timeout_in_s + self.__waiting_for_state_update = False + self.__waiting_for_terminal_update = False + self.__last_received_state = 0 + self.__last_received_terminal = True + self.__last_received_reward = 0.0 + self.__waiting_for_reward_update = False + + rospy.loginfo("Initialize node {0}".format(ros_node_name)) + + self.__step_publisher = rospy.Publisher(action_topic, Int32, queue_size=1) + rospy.loginfo('Step Publisher initialized with topic {}'.format(action_topic)) + + self.__reset_publisher = rospy.Publisher(reset_topic, Bool, queue_size=1) + rospy.loginfo('Reset Publisher initialized with topic {}'.format(reset_topic)) + + rospy.init_node(ros_node_name, anonymous=True) + + self.__state_subscriber = rospy.Subscriber(state_topic, Float32MultiArray, self.__state_callback) + rospy.loginfo('State Subscriber registered with topic {}'.format(state_topic)) + + self.__terminal_state_subscriber = rospy.Subscriber(terminal_state_topic, Bool, self.__terminal_state_callback) + rospy.loginfo('Terminal State Subscriber registered with topic {}'.format(terminal_state_topic)) + + self.__reward_subscriber = rospy.Subscriber(reward_topic, Float32, self.__reward_callback) + rospy.loginfo('Reward Subscriber registered with topic {}'.format(reward_topic)) + + rate = rospy.Rate(10) + + thread.start_new_thread(rospy.spin, ()) + time.sleep(2) + + def reset(self): + time.sleep(0.5) + reset_message = Bool() + reset_message.data = True + self.__waiting_for_state_update = True + self.__reset_publisher.publish(reset_message) + while self.__last_received_terminal: + self.__wait_for_new_state(self.__reset_publisher, reset_message) + return self.__last_received_state + + def step(self, action): + action_rospy = Int32() + action_rospy.data = action + + logger.debug('Send action: {}'.format(action)) + + self.__waiting_for_state_update = True + self.__waiting_for_terminal_update = True + self.__waiting_for_reward_update = True + self.__step_publisher.publish(action_rospy) + self.__wait_for_new_state(self.__step_publisher, action_rospy) + next_state = self.__last_received_state + terminal = self.__last_received_terminal + reward = self.__last_received_reward + rospy.logdebug('Calculated reward: {}'.format(reward)) + + return next_state, reward, terminal, 0 + + def __wait_for_new_state(self, publisher, msg): + time_of_timeout = time.time() + self.__timeout_in_s + timeout_counter = 0 + while(self.__waiting_for_state_update + or self.__waiting_for_terminal_update or self.__waiting_for_reward_update): + is_timeout = (time.time() > time_of_timeout) + if (is_timeout): + if timeout_counter < 3: + rospy.logwarn("Timeout occured: Retry message") + publisher.publish(msg) + timeout_counter += 1 + time_of_timeout = time.time() + self.__timeout_in_s + else: + rospy.logerr("Timeout 3 times in a row: Terminate application") + exit() + time.sleep(100/1000) + + def close(self): + rospy.signal_shutdown('Program ended!') + + def __state_callback(self, data): + self.__last_received_state = np.array(data.data, dtype='float32') + rospy.logdebug('Received state: {}'.format(self.__last_received_state)) + self.__waiting_for_state_update = False + + def __terminal_state_callback(self, data): + self.__last_received_terminal = data.data + rospy.logdebug('Received terminal flag: 
{}'.format(self.__last_received_terminal)) + logger.debug('Received terminal: {}'.format(self.__last_received_terminal)) + self.__waiting_for_terminal_update = False + + def __reward_callback(self, data): + self.__last_received_reward = float(data.data) + logger.debug('Received reward: {}'.format(self.__last_received_reward)) + self.__waiting_for_reward_update = False diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/replay_memory.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/replay_memory.py new file mode 100644 index 0000000..d22147a --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/replay_memory.py @@ -0,0 +1,208 @@ +import numpy as np + + +class ReplayMemoryBuilder(object): + def __init__(self): + self.__supported_methods = ['online', 'buffer', 'combined'] + + def build_by_params( + self, + state_dim, + method='online', + state_dtype='float32', + action_dim=(1,), + action_dtype='uint8', + rewards_dtype='float32', + memory_size=1000, + sample_size=32 + ): + assert state_dim is not None + assert action_dim is not None + assert method in self.__supported_methods + + if method == 'online': + return self.build_online_memory( + state_dim=state_dim, state_dtype=state_dtype, + action_dtype=action_dtype, action_dim=action_dim, + rewards_dtype=rewards_dtype) + else: + assert memory_size is not None and memory_size > 0 + assert sample_size is not None and sample_size > 0 + if method == 'buffer': + return self.build_buffered_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + else: + return self.build_combined_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + + def build_buffered_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): + assert memory_size > 0 + assert sample_size > 0 + return ReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_combined_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): + assert memory_size > 0 + assert sample_size > 0 + return CombinedReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_online_memory( + self, state_dim, state_dtype, action_dtype, action_dim, rewards_dtype + ): + return OnlineReplayMemory( + state_dim, state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + +class ReplayMemory(object): + def __init__( + self, + state_dim, + sample_size, + size=1000, + action_dim=(1,), + state_dtype='float32', + action_dtype='uint8', + rewards_dtype='float32' + ): + assert size > 0, "Size must be greater than zero" + assert type(state_dim) is tuple, "State dimension must be a tuple" + assert type(action_dim) is tuple, "Action dimension must be a tuple" + assert sample_size > 0 + self._size = size + self._sample_size = sample_size + self._cur_size = 0 + self._pointer = 0 + self._state_dim = state_dim + self._state_dtype = 
state_dtype + self._action_dim = action_dim + self._action_dtype = action_dtype + self._rewards_dtype = rewards_dtype + self._states = np.zeros((self._size,) + state_dim, dtype=state_dtype) + self._actions = np.zeros( + (self._size,) + action_dim, dtype=action_dtype) + self._rewards = np.array([0] * self._size, dtype=rewards_dtype) + self._next_states = np.zeros( + (self._size,) + state_dim, dtype=state_dtype) + self._terminals = np.array([0] * self._size, dtype='bool') + + @property + def sample_size(self): + return self._sample_size + + def append(self, state, action, reward, next_state, terminal): + self._states[self._pointer] = state + self._actions[self._pointer] = action + self._rewards[self._pointer] = reward + self._next_states[self._pointer] = next_state + self._terminals[self._pointer] = terminal + + self._pointer = self._pointer + 1 + if self._pointer == self._size: + self._pointer = 0 + + self._cur_size = min(self._size, self._cur_size + 1) + + def at(self, index): + return self._states[index],\ + self._actions[index],\ + self._rewards[index],\ + self._next_states[index],\ + self._terminals[index] + + def is_sample_possible(self, batch_size=None): + batch_size = batch_size if batch_size is not None\ + else self._sample_size + return self._cur_size >= batch_size + + def sample(self, batch_size=None): + batch_size = batch_size if batch_size is not None\ + else self._sample_size + assert self._cur_size >= batch_size,\ + "Size of replay memory must be larger than batch size" + i = 0 + states = np.zeros(( + batch_size,)+self._state_dim, dtype=self._state_dtype) + actions = np.zeros( + (batch_size,)+self._action_dim, dtype=self._action_dtype) + rewards = np.zeros(batch_size, dtype=self._rewards_dtype) + next_states = np.zeros( + (batch_size,)+self._state_dim, dtype=self._state_dtype) + terminals = np.zeros(batch_size, dtype='bool') + + while i < batch_size: + rnd_index = np.random.randint(low=0, high=self._cur_size) + states[i] = self._states.take(rnd_index, axis=0) + actions[i] = self._actions.take(rnd_index, axis=0) + rewards[i] = self._rewards.take(rnd_index, axis=0) + next_states[i] = self._next_states.take(rnd_index, axis=0) + terminals[i] = self._terminals.take(rnd_index, axis=0) + i += 1 + + return states, actions, rewards, next_states, terminals + + +class OnlineReplayMemory(ReplayMemory): + def __init__( + self, state_dim, state_dtype='float32', action_dim=(1,), + action_dtype='uint8', rewards_dtype='float32' + ): + super(OnlineReplayMemory, self).__init__( + state_dim, sample_size=1, size=1, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + + +class CombinedReplayMemory(ReplayMemory): + def __init__( + self, state_dim, sample_size, size=1000, state_dtype='float32', + action_dim=(1,), action_dtype='uint8', rewards_dtype='float32' + ): + super(CombinedReplayMemory, self).__init__( + state_dim=state_dim, sample_size=(sample_size - 1), size=size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + self._last_state = np.zeros((1,) + state_dim, dtype=state_dtype) + self._last_action = np.array((1,) + action_dim, dtype=action_dtype) + self._last_reward = np.array([0], dtype=rewards_dtype) + self._last_next_state = np.zeros((1,) + state_dim, dtype=state_dtype) + self._last_terminal = np.array([0], dtype='bool') + + def append(self, state, action, reward, next_state, terminal): + super(CombinedReplayMemory, self).append( + state, action, reward, next_state, 
terminal) + self._last_state = state + self._last_action = action + self._last_reward = reward + self._last_next_state = next_state + self._last_terminal = terminal + + def sample(self, batch_size=None): + batch_size = (batch_size-1) if batch_size is not None\ + else self._sample_size + states, actions, rewards, next_states, terminals = super( + CombinedReplayMemory, self).sample(batch_size=batch_size) + states = np.append(states, [self._last_state], axis=0) + actions = np.append(actions, [self._last_action], axis=0) + rewards = np.append(rewards, [self._last_reward], axis=0) + next_states = np.append(next_states, [self._last_next_state], axis=0) + terminals = np.append(terminals, [self._last_terminal], axis=0) + return states, actions, rewards, next_states, terminals \ No newline at end of file diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/strategy.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/strategy.py new file mode 100644 index 0000000..4c47adc --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/strategy.py @@ -0,0 +1,172 @@ +import numpy as np + + +class StrategyBuilder(object): + def __init__(self): + pass + + def build_by_params( + self, + method='epsgreedy', + epsilon=0.5, + min_epsilon=0.05, + epsilon_decay_method='no', + epsilon_decay=0.0, + epsilon_decay_start=0, + action_dim=None, + action_low=None, + action_high=None, + mu=0.0, + theta=0.5, + sigma=0.3 + ): + + if epsilon_decay_method == 'linear': + decay = LinearDecay( + eps_decay=epsilon_decay, min_eps=min_epsilon, + decay_start=epsilon_decay_start) + else: + decay = NoDecay() + + if method == 'epsgreedy': + assert action_dim is not None + assert len(action_dim) == 1 + return EpsilonGreedyStrategy( + eps=epsilon, number_of_actions=action_dim[0], + decay_method=decay) + elif method == 'ornstein_uhlenbeck': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert mu is not None + assert theta is not None + assert sigma is not None + return OrnsteinUhlenbeckStrategy( + action_dim, action_low, action_high, epsilon, mu, theta, + sigma, decay) + else: + assert action_dim is not None + assert len(action_dim) == 1 + return GreedyStrategy() + + +class BaseDecay(object): + def __init__(self): + pass + + def decay(self, *args): + raise NotImplementedError + + def __call__(self, *args): + return self.decay(*args) + + +class NoDecay(BaseDecay): + def __init__(self): + super(NoDecay, self).__init__() + + def decay(self, cur_eps, episode): + return cur_eps + + +class LinearDecay(BaseDecay): + def __init__(self, eps_decay, min_eps=0, decay_start=0): + super(LinearDecay, self).__init__() + self.eps_decay = eps_decay + self.min_eps = min_eps + self.decay_start = decay_start + + def decay(self, cur_eps, episode): + if episode < self.decay_start: + return cur_eps + else: + return max(cur_eps - self.eps_decay, self.min_eps) + + +class BaseStrategy(object): + def __init__(self, decay_method): + self._decay_method = decay_method + + def select_action(self, values, decay_method): + raise NotImplementedError + + def decay(self, episode): + self.cur_eps = self._decay_method.decay(self.cur_eps, episode) + + def reset(self): + pass + + +class EpsilonGreedyStrategy(BaseStrategy): + def __init__(self, eps, number_of_actions, decay_method): + super(EpsilonGreedyStrategy, self).__init__(decay_method) + self.eps = eps + self.cur_eps = eps + self.__number_of_actions = number_of_actions + + 
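+ # With probability cur_eps a random action index is explored; otherwise the
+ # greedy action (argmax of the given action values) is returned.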
def select_action(self, values): + do_exploration = (np.random.rand() < self.cur_eps) + if do_exploration: + action = np.random.randint(low=0, high=self.__number_of_actions) + else: + action = values.asnumpy().argmax() + return action + + +class GreedyStrategy(BaseStrategy): + def __init__(self): + super(GreedyStrategy, self).__init__(None) + + def select_action(self, values): + return values.asnumpy().argmax() + + def decay(self): + pass + + +class OrnsteinUhlenbeckStrategy(BaseStrategy): + """ + Ornstein-Uhlenbeck process: dxt = theta * (mu - xt) * dt + sigma * dWt + where Wt denotes the Wiener process. + """ + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + mu=0.0, + theta=.15, + sigma=.3, + decay=NoDecay() + ): + super(OrnsteinUhlenbeckStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._decay_method = decay + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._mu = np.array(mu) + self._theta = np.array(theta) + self._sigma = np.array(sigma) + + self.state = np.ones(self._action_dim) * self._mu + + def _evolve_state(self): + x = self.state + dx = self._theta * (self._mu - x)\ + + self._sigma * np.random.randn(len(x)) + self.state = x + dx + return self.state + + def reset(self): + self.state = np.ones(self._action_dim) * self._mu + + def select_action(self, values): + noise = self._evolve_state() + action = values + (self.cur_eps * noise) + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/util.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/util.py new file mode 100644 index 0000000..a1da1ce --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/util.py @@ -0,0 +1,276 @@ +import signal +import sys +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import style +import time +import os +import mxnet as mx +from mxnet import gluon, nd +import cnnarch_logger + +LOSS_FUNCTIONS = { + 'l1': gluon.loss.L1Loss(), + 'euclidean': gluon.loss.L2Loss(), + 'huber_loss': gluon.loss.HuberLoss(), + 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), + 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} + + +def make_directory_if_not_exist(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + +def copy_net(net, input_state_dim, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2(mx.nd.ones((1,) + input_state_dim, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + + return net2 + + +def copy_net_with_two_inputs(net, input_state_dim1, input_state_dim2, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2( + nd.ones((1,) + input_state_dim1, ctx=ctx), + nd.ones((1,) + input_state_dim2, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + + return net2 + + +def get_loss_function(loss_function_name): + if loss_function_name not in 
LOSS_FUNCTIONS: + raise ValueError('Loss function does not exist') + return LOSS_FUNCTIONS[loss_function_name] + + +class AgentSignalHandler(object): + def __init__(self): + signal.signal(signal.SIGINT, self.interrupt_training) + self.__agent = None + self.__times_interrupted = 0 + + def register_agent(self, agent): + self.__agent = agent + + def interrupt_training(self, sig, frame): + self.__times_interrupted = self.__times_interrupted + 1 + if self.__times_interrupted <= 3: + if self.__agent: + self.__agent.set_interrupt_flag(True) + else: + print('Interrupt called three times: Force quit') + sys.exit(1) + +style.use('fivethirtyeight') + + +class TrainingStats(object): + def __init__(self, max_episodes): + self._logger = cnnarch_logger.ArchLogger.get_logger() + self._max_episodes = max_episodes + self._all_total_rewards = np.zeros((max_episodes,)) + self._all_eps = np.zeros((max_episodes,)) + self._all_time = np.zeros((max_episodes,)) + self._all_mean_reward_last_100_episodes = np.zeros((max_episodes,)) + + @property + def logger(self): + return self._logger + + @logger.setter + def logger(self, logger): + self._logger = logger + + @logger.deleter + def logger(self): + self._logger = None + + def add_total_reward(self, episode, total_reward): + self._all_total_rewards[episode] = total_reward + + def add_eps(self, episode, eps): + self._all_eps[episode] = eps + + def add_time(self, episode, time): + self._all_time[episode] = time + + def add_mean_reward_last_100(self, episode, mean_reward): + self._all_mean_reward_last_100_episodes[episode] = mean_reward + + def log_episode(self, *args): + raise NotImplementedError + + def mean_of_reward(self, cur_episode, last=100): + if cur_episode > 0: + reward_last_100 =\ + self._all_total_rewards[max(0, cur_episode-last):cur_episode] + return np.mean(reward_last_100) + else: + return self._all_total_rewards[0] + + def save(self, path): + np.save(os.path.join(path, 'total_rewards'), self._all_total_rewards) + np.save(os.path.join(path, 'eps'), self._all_eps) + np.save(os.path.join(path, 'time'), self._all_time) + np.save( + os.path.join(path, 'mean_reward'), + self._all_mean_reward_last_100_episodes) + + def _log_episode(self, episode, start_time, training_steps, eps, reward): + self.add_eps(episode, eps) + self.add_total_reward(episode, reward) + end = time.time() + mean_reward_last_100 = self.mean_of_reward(episode, last=100) + time_elapsed = end - start_time + self.add_time(episode, time_elapsed) + self.add_mean_reward_last_100(episode, mean_reward_last_100) + return ('Episode: %d, Total Reward: %.3f, ' + 'Avg. Reward Last 100 Episodes: %.3f, {}, ' + 'Time: %.3f, Training Steps: %d, Eps: %.3f') % ( + episode, reward, mean_reward_last_100, time_elapsed, + training_steps, eps), mean_reward_last_100 + + +class DqnTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DqnTrainingStats, self).__init__(max_episodes) + self._all_avg_loss = np.zeros((max_episodes,)) + + def add_avg_loss(self, episode, avg_loss): + self._all_avg_loss[episode] = avg_loss + + def log_episode( + self, episode, start_time, training_steps, avg_loss, eps, reward + ): + self.add_avg_loss(episode, avg_loss) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(('Avg. 
Loss: %.3f') % (avg_loss)) + + self._logger.info(info) + return avg_reward + + def save_stats(self, path): + fig = plt.figure(figsize=(20, 20)) + + sub_rewards = fig.add_subplot(221) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) + + sub_loss = fig.add_subplot(222) + sub_loss.set_title('Avg. Loss per episode') + sub_loss.plot(np.arange(self._max_episodes), self._all_avg_loss) + + sub_eps = fig.add_subplot(223) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) + + sub_rewards = fig.add_subplot(224) + sub_rewards.set_title('Avg. mean reward of last 100 episodes') + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) + + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) + + def save(self, path): + super(DqnTrainingStats, self).save(path) + np.save(os.path.join(path, 'avg_loss'), self._all_avg_loss) + + +class DdpgTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DdpgTrainingStats, self).__init__(max_episodes) + self._all_avg_critic_loss = np.zeros((max_episodes,)) + self._all_avg_actor_loss = np.zeros((max_episodes,)) + self._all_avg_qvalues = np.zeros((max_episodes,)) + + def add_avg_critic_loss(self, episode, avg_critic_loss): + self._all_avg_critic_loss[episode] = avg_critic_loss + + def add_avg_actor_loss(self, episode, avg_actor_loss): + self._all_avg_actor_loss[episode] = avg_actor_loss + + def add_avg_qvalues(self, episode, avg_qvalues): + self._all_avg_qvalues[episode] = avg_qvalues + + def log_episode( + self, episode, start_time, training_steps, actor_loss, + critic_loss, qvalues, eps, reward + ): + self.add_avg_actor_loss(episode, actor_loss) + self.add_avg_critic_loss(episode, critic_loss) + self.add_avg_qvalues(episode, qvalues) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(( + 'Avg. Actor Loss: %.3f ' + 'Avg. Critic Loss: %.3f ' + 'Avg. Q-Values: %.3f') % (actor_loss, critic_loss, qvalues)) + + self.logger.info(info) + return avg_reward + + def save(self, path): + super(DdpgTrainingStats, self).save(path) + np.save(os.path.join( + path, 'avg_critic_loss'), self._all_avg_critic_loss) + np.save(os.path.join(path, 'avg_actor_loss'), self._all_avg_actor_loss) + np.save(os.path.join(path, 'avg_qvalues'), self._all_avg_qvalues) + + def save_stats(self, path): + fig = plt.figure(figsize=(120, 120)) + + sub_rewards = fig.add_subplot(321) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) + + sub_actor_loss = fig.add_subplot(322) + sub_actor_loss.set_title('Avg. Actor Loss per episode') + sub_actor_loss.plot( + np.arange(self._max_episodes), self._all_avg_actor_loss) + + sub_critic_loss = fig.add_subplot(323) + sub_critic_loss.set_title('Avg. Critic Loss per episode') + sub_critic_loss.plot( + np.arange(self._max_episodes), self._all_avg_critic_loss) + + sub_qvalues = fig.add_subplot(324) + sub_qvalues.set_title('Avg. QValues per episode') + sub_qvalues.plot( + np.arange(self._max_episodes), self._all_avg_qvalues) + + sub_eps = fig.add_subplot(325) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) + + sub_rewards = fig.add_subplot(326) + sub_rewards.set_title('Avg. 
mean reward of last 100 episodes') + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) + + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) diff --git a/src/test/resources/target_code/ReinforcementConfig3/start_training.sh b/src/test/resources/target_code/ReinforcementConfig3/start_training.sh new file mode 100644 index 0000000..e058138 --- /dev/null +++ b/src/test/resources/target_code/ReinforcementConfig3/start_training.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python CNNTrainer_reinforcementConfig3.py \ No newline at end of file diff --git a/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py b/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py new file mode 100644 index 0000000..ca7a4e6 --- /dev/null +++ b/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py @@ -0,0 +1,136 @@ +from reinforcement_learning.agent import DdpgAgent +from reinforcement_learning.util import AgentSignalHandler +from reinforcement_learning.cnnarch_logger import ArchLogger +from reinforcement_learning.CNNCreator_CriticNetwork import CNNCreator_CriticNetwork +import reinforcement_learning.environment +import CNNCreator_actorNetwork + +import os +import sys +import re +import time +import numpy as np +import mxnet as mx + + +def resume_session(sessions_dir): + resume_session = False + resume_directory = None + if os.path.isdir(sessions_dir): + regex = re.compile(r'\d\d\d\d-\d\d-\d\d-\d\d-\d\d') + dir_content = os.listdir(sessions_dir) + session_files = filter(regex.search, dir_content) + session_files.sort(reverse=True) + for d in session_files: + interrupted_session_dir = os.path.join(sessions_dir, d, '.interrupted_session') + if os.path.isdir(interrupted_session_dir): + resume = raw_input('Interrupted session from {} found. Do you want to resume? 
(y/n) '.format(d)) + if resume == 'y': + resume_session = True + resume_directory = interrupted_session_dir + break + return resume_session, resume_directory + + +if __name__ == "__main__": + agent_name = 'ddpg-agent' + # Prepare output directory and logger + all_output_dir = os.path.join('model', agent_name) + output_directory = os.path.join( + all_output_dir, + time.strftime('%Y-%m-%d-%H-%M-%S', + time.localtime(time.time()))) + ArchLogger.set_output_directory(output_directory) + ArchLogger.set_logger_name(agent_name) + ArchLogger.set_output_level(ArchLogger.INFO) + + env = reinforcement_learning.environment.GymEnvironment('CartPole-v0') + + context = mx.cpu() + actor_creator = CNNCreator_actorNetwork.CNNCreator_actorNetwork() + actor_creator.construct(context) + critic_creator = CNNCreator_CriticNetwork() + critic_creator.construct(context) + + agent_params = { + 'environment': env, + 'replay_memory_params': { + 'method': 'buffer', + 'memory_size': 1000000, + 'sample_size': 64, + 'state_dtype': 'float32', + 'action_dtype': 'float32', + 'rewards_dtype': 'float32' + }, + 'strategy_params': { + 'method':'ornstein_uhlenbeck', + 'epsilon': 1, + 'min_epsilon': 0.001, + 'epsilon_decay_method': 'linear', + 'epsilon_decay': 0.0001, + 'epsilon_decay_start': 50, + 'action_low': -1, + 'action_high': 1, + 'mu': [0, 0.1, 0.3], + 'theta': [0.5, 0, 0.8], + 'sigma': [0.3, 0.6, -0.9], + }, + 'agent_name': agent_name, + 'verbose': True, + 'output_directory': output_directory, + 'state_dim': (8,), + 'action_dim': (3,), + 'discount_factor': 0.99999, + 'training_episodes': 1000, + 'train_interval': 1, + 'start_training': 100, + 'snapshot_interval': 50, + 'max_episode_step': 250, + 'evaluation_samples': 100, + 'target_score': 185.5, + 'actor': actor_creator.net, + 'critic': critic_creator.net, + 'soft_target_update_rate': 0.001, + 'actor_optimizer': 'adam', + 'actor_optimizer_params': { + 'learning_rate_minimum': 1.0E-5, + 'learning_rate': 0.001}, + 'critic_optimizer': 'rmsprop', + 'critic_optimizer_params': { + 'weight_decay': 0.01, + 'centered': True, + 'gamma2': 0.9, + 'gamma1': 0.9, + 'clip_weights': 10.0, + 'learning_rate_decay': 0.9, + 'epsilon': 1.0E-6, + 'rescale_grad': 1.1, + 'clip_gradient': 10.0, + 'learning_rate_minimum': 1.0E-5, + 'learning_rate_policy': 'step', + 'learning_rate': 0.001, + 'step_size': 1000}, + } + + resume, resume_directory = resume_session(all_output_dir) + + if resume: + output_directory, _ = os.path.split(resume_directory) + ArchLogger.set_output_directory(output_directory) + resume_agent_params = { + 'session_dir': resume_directory, + 'environment': env, + 'actor': actor_creator.net, + 'critic': critic_creator.net + } + agent = DdpgAgent.resume_from_session(**resume_agent_params) + else: + agent = DdpgAgent(**agent_params) + + signal_handler = AgentSignalHandler() + signal_handler.register_agent(agent) + + train_successful = agent.train() + + if train_successful: + agent.save_best_network(actor_creator._model_dir_ + actor_creator._model_prefix_ + '_newest', epoch=0) diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/CNNCreator_CriticNetwork.py b/src/test/resources/target_code/ddpg/reinforcement_learning/CNNCreator_CriticNetwork.py new file mode 100644 index 0000000..46a9ea9 --- /dev/null +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/CNNCreator_CriticNetwork.py @@ -0,0 +1,56 @@ +import mxnet as mx +import logging +import os +from CNNNet_CriticNetwork import Net + +class CNNCreator_CriticNetwork: + _model_dir_ = "model/CriticNetwork/" + 
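+ # load() searches this directory for '<prefix>-<epoch>.params' checkpoints,
+ # and construct() exports the freshly initialized network under the same prefix.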
_model_prefix_ = "model" + _input_shapes_ = [(8,),(3,)] + + def __init__(self): + self.weight_initializer = mx.init.Normal() + self.net = None + + def get_input_shapes(self): + return self._input_shapes_ + + def load(self, context): + lastEpoch = 0 + param_file = None + + try: + os.remove(self._model_dir_ + self._model_prefix_ + "_newest-0000.params") + except OSError: + pass + try: + os.remove(self._model_dir_ + self._model_prefix_ + "_newest-symbol.json") + except OSError: + pass + + if os.path.isdir(self._model_dir_): + for file in os.listdir(self._model_dir_): + if ".params" in file and self._model_prefix_ in file: + epochStr = file.replace(".params","").replace(self._model_prefix_ + "-","") + epoch = int(epochStr) + if epoch > lastEpoch: + lastEpoch = epoch + param_file = file + if param_file is None: + return 0 + else: + logging.info("Loading checkpoint: " + param_file) + self.net.load_parameters(self._model_dir_ + param_file) + return lastEpoch + + + def construct(self, context, data_mean=None, data_std=None): + self.net = Net(data_mean=data_mean, data_std=data_std) + self.net.collect_params().initialize(self.weight_initializer, ctx=context) + self.net.hybridize() + self.net(mx.nd.zeros((1,)+self._input_shapes_[0], ctx=context),mx.nd.zeros((1,)+self._input_shapes_[1], ctx=context)) + + if not os.path.exists(self._model_dir_): + os.makedirs(self._model_dir_) + + self.net.export(self._model_dir_ + self._model_prefix_, epoch=0) diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/CNNNet_CriticNetwork.py b/src/test/resources/target_code/ddpg/reinforcement_learning/CNNNet_CriticNetwork.py new file mode 100644 index 0000000..8444a87 --- /dev/null +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/CNNNet_CriticNetwork.py @@ -0,0 +1,119 @@ +import mxnet as mx +import numpy as np +from mxnet import gluon + +class Softmax(gluon.HybridBlock): + def __init__(self, **kwargs): + super(Softmax, self).__init__(**kwargs) + + def hybrid_forward(self, F, x): + return F.softmax(x) + + +class Split(gluon.HybridBlock): + def __init__(self, num_outputs, axis=1, **kwargs): + super(Split, self).__init__(**kwargs) + with self.name_scope(): + self.axis = axis + self.num_outputs = num_outputs + + def hybrid_forward(self, F, x): + return F.split(data=x, axis=self.axis, num_outputs=self.num_outputs) + + +class Concatenate(gluon.HybridBlock): + def __init__(self, dim=1, **kwargs): + super(Concatenate, self).__init__(**kwargs) + with self.name_scope(): + self.dim = dim + + def hybrid_forward(self, F, *x): + return F.concat(*x, dim=self.dim) + + +class ZScoreNormalization(gluon.HybridBlock): + def __init__(self, data_mean, data_std, **kwargs): + super(ZScoreNormalization, self).__init__(**kwargs) + with self.name_scope(): + self.data_mean = self.params.get('data_mean', shape=data_mean.shape, + init=mx.init.Constant(data_mean.asnumpy().tolist()), differentiable=False) + self.data_std = self.params.get('data_std', shape=data_mean.shape, + init=mx.init.Constant(data_std.asnumpy().tolist()), differentiable=False) + + def hybrid_forward(self, F, x, data_mean, data_std): + x = F.broadcast_sub(x, data_mean) + x = F.broadcast_div(x, data_std) + return x + + +class Padding(gluon.HybridBlock): + def __init__(self, padding, **kwargs): + super(Padding, self).__init__(**kwargs) + with self.name_scope(): + self.pad_width = padding + + def hybrid_forward(self, F, x): + x = F.pad(data=x, + mode='constant', + pad_width=self.pad_width, + constant_value=0) + return x + + +class 
NoNormalization(gluon.HybridBlock): + def __init__(self, **kwargs): + super(NoNormalization, self).__init__(**kwargs) + + def hybrid_forward(self, F, x): + return x + + +class Net(gluon.HybridBlock): + def __init__(self, data_mean=None, data_std=None, **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + if not data_mean is None: + assert(not data_std is None) + self.state_input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) + else: + self.state_input_normalization = NoNormalization() + + self.fc2_1_ = gluon.nn.Dense(units=300, use_bias=True) + # fc2_1_, output shape: {[300,1,1]} + + self.relu2_1_ = gluon.nn.Activation(activation='relu') + self.fc3_1_ = gluon.nn.Dense(units=600, use_bias=True) + # fc3_1_, output shape: {[600,1,1]} + + if not data_mean is None: + assert(not data_std is None) + self.action_input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) + else: + self.action_input_normalization = NoNormalization() + + self.fc2_2_ = gluon.nn.Dense(units=600, use_bias=True) + # fc2_2_, output shape: {[600,1,1]} + + self.fc4_ = gluon.nn.Dense(units=600, use_bias=True) + # fc4_, output shape: {[600,1,1]} + + self.relu4_ = gluon.nn.Activation(activation='relu') + self.fc5_ = gluon.nn.Dense(units=1, use_bias=True) + # fc5_, output shape: {[1,1,1]} + + + self.last_layer = 'linear' + + + def hybrid_forward(self, F, state, action): + state = self.state_input_normalization(state) + fc2_1_ = self.fc2_1_(state) + relu2_1_ = self.relu2_1_(fc2_1_) + fc3_1_ = self.fc3_1_(relu2_1_) + action = self.action_input_normalization(action) + fc2_2_ = self.fc2_2_(action) + add4_ = fc3_1_ + fc2_2_ + fc4_ = self.fc4_(add4_) + relu4_ = self.relu4_(fc4_) + fc5_ = self.fc5_(relu4_) + return fc5_ diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/__init__.py b/src/test/resources/target_code/ddpg/reinforcement_learning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py b/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py new file mode 100644 index 0000000..c06b32a --- /dev/null +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py @@ -0,0 +1,900 @@ +import mxnet as mx +import numpy as np +import time +import os +import sys +import util +import matplotlib.pyplot as plt +import pyprind +from cnnarch_logger import ArchLogger +from replay_memory import ReplayMemoryBuilder +from strategy import StrategyBuilder +from util import copy_net, get_loss_function,\ + copy_net_with_two_inputs, DdpgTrainingStats, DqnTrainingStats,\ + make_directory_if_not_exist +from mxnet import nd, gluon, autograd + + +class Agent(object): + def __init__( + self, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + training_episodes=50, + train_interval=1, + start_training=0, + snapshot_interval=200, + agent_name='Agent', + max_episode_step=99999, + evaluation_samples=1000, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + assert 0 < discount_factor <= 1,\ + 'Discount factor must be between 0 and 1' + assert train_interval > 0, 'Train interval must be greater 0' + assert snapshot_interval > 0, 'Snapshot interval must be greater 0' + assert max_episode_step > 0,\ + 'Maximal steps per episode must be greater 0' + assert training_episodes > 0, 'Trainings episode must be greater 0' + assert replay_memory_params is not None,\ + 
'Replay memory parameter not set' + assert type(state_dim) is tuple, 'State dimension is not a tuple' + assert type(action_dim) is tuple, 'Action dimension is not a tuple' + + self._logger = ArchLogger.get_logger() + self._ctx = mx.gpu() if ctx == 'gpu' else mx.cpu() + self._environment = environment + self._discount_factor = discount_factor + self._training_episodes = training_episodes + self._train_interval = train_interval + self._verbose = verbose + self._state_dim = state_dim + + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = action_dim + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + self._action_dim = action_dim + + strategy_params['action_dim'] = self._action_dim + self._strategy_params = strategy_params + strategy_builder = StrategyBuilder() + self._strategy = strategy_builder.build_by_params(**strategy_params) + self._agent_name = agent_name + self._snapshot_interval = snapshot_interval + self._creation_time = time.time() + self._max_episode_step = max_episode_step + self._start_training = start_training + self._output_directory = output_directory + self._target_score = target_score + + self._evaluation_samples = evaluation_samples + self._best_avg_score = -np.infty + self._best_net = None + + self._interrupt_flag = False + self._training_stats = None + + # Training Context + self._current_episode = 0 + self._total_steps = 0 + + @property + def current_episode(self): + return self._current_episode + + @property + def environment(self): + return self._environment + + def save_config_file(self): + import json + make_directory_if_not_exist(self._output_directory) + filename = os.path.join(self._output_directory, 'config.json') + config = self._make_config_dict() + with open(filename, mode='w') as fp: + json.dump(config, fp, indent=4) + + def set_interrupt_flag(self, interrupt): + self._interrupt_flag = interrupt + + def _interrupt_training(self): + import pickle + self._logger.info('Training interrupted; Store state for resuming') + session_dir = self._get_session_dir() + agent_session_file = os.path.join(session_dir, 'agent.p') + logger = self._logger + + self._make_pickle_ready(session_dir) + + with open(agent_session_file, 'wb') as f: + pickle.dump(self, f, protocol=2) + logger.info('State successfully stored') + + def _make_pickle_ready(self, session_dir): + del self._training_stats.logger + self._logger = None + self._environment.close() + self._environment = None + self._save_net(self._best_net, 'best_net', session_dir) + self._best_net = None + + def _make_config_dict(self): + config = dict() + config['state_dim'] = self._state_dim + config['action_dim'] = self._action_dim + config['ctx'] = str(self._ctx) + config['discount_factor'] = self._discount_factor + config['strategy_params'] = self._strategy_params + config['replay_memory_params'] = self._replay_memory_params + config['training_episodes'] = self._training_episodes + config['start_training'] = self._start_training + config['evaluation_samples'] = self._evaluation_samples + config['train_interval'] = self._train_interval + config['snapshot_interval'] = self._snapshot_interval + config['agent_name'] = self._agent_name + config['max_episode_step'] = self._max_episode_step + config['output_directory'] = self._output_directory + config['verbose'] = self._verbose + config['target_score'] = self._target_score + return config + + 
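+ # Translates optimizer options into the names the Gluon trainer expects:
+ # 'weight_decay' is renamed to 'wd', and 'learning_rate_decay', 'step_size'
+ # and 'learning_rate_minimum' are folded into an mx.lr_scheduler.FactorScheduler
+ # passed as 'lr_scheduler'.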
def _adjust_optimizer_params(self, optimizer_params): + if 'weight_decay' in optimizer_params: + optimizer_params['wd'] = optimizer_params['weight_decay'] + del optimizer_params['weight_decay'] + if 'learning_rate_decay' in optimizer_params: + min_learning_rate = 1e-8 + if 'learning_rate_minimum' in optimizer_params: + min_learning_rate = optimizer_params['learning_rate_minimum'] + del optimizer_params['learning_rate_minimum'] + optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( + optimizer_params['step_size'], + factor=optimizer_params['learning_rate_decay'], + stop_factor_lr=min_learning_rate) + del optimizer_params['step_size'] + del optimizer_params['learning_rate_decay'] + + return optimizer_params + + def _sample_from_memory(self): + states, actions, rewards, next_states, terminals\ + = self._memory.sample(batch_size=self._minibatch_size) + states = nd.array(states, ctx=self._ctx) + actions = nd.array(actions, ctx=self._ctx) + rewards = nd.array(rewards, ctx=self._ctx) + next_states = nd.array(next_states, ctx=self._ctx) + terminals = nd.array(terminals, ctx=self._ctx) + return states, actions, rewards, next_states, terminals + + def evaluate(self, target=None, sample_games=100, verbose=True): + target = self._target_score if target is None else target + if target: + target_achieved = 0 + total_reward = 0 + + self._logger.info('Sampling from {} games...'.format(sample_games)) + for g in pyprind.prog_bar(range(sample_games)): + state = self._environment.reset() + step = 0 + game_reward = 0 + terminal = False + while not terminal and (step < self._max_episode_step): + action = self.get_next_action(state) + state, reward, terminal, _ = self._environment.step(action) + game_reward += reward + step += 1 + + if verbose: + info = 'Game %d: Reward %f' % (g, game_reward) + self._logger.debug(info) + if target: + if game_reward >= target: + target_achieved += 1 + total_reward += game_reward + + avg_reward = float(total_reward)/float(sample_games) + info = 'Avg. 
Reward: %f' % avg_reward + if target: + target_achieved_ratio = int( + (float(target_achieved)/float(sample_games))*100) + info += '; Target Achieved in %d%% of games'\ + % (target_achieved_ratio) + + if verbose: + self._logger.info(info) + return avg_reward + + def _do_snapshot_if_in_interval(self, episode): + do_snapshot =\ + (episode != 0 and (episode % self._snapshot_interval == 0)) + if do_snapshot: + self.save_parameters(episode=episode) + self._evaluate() + + def _evaluate(self, verbose=True): + avg_reward = self.evaluate( + sample_games=self._evaluation_samples, verbose=False) + info = 'Evaluation -> Average Reward in {} games: {}'.format( + self._evaluation_samples, avg_reward) + + if self._best_avg_score is None or self._best_avg_score <= avg_reward: + self._save_current_as_best_net() + self._best_avg_score = avg_reward + if verbose: + self._logger.info(info) + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _do_training(self): + return (self._total_steps % self._train_interval == 0) and\ + (self._memory.is_sample_possible(self._minibatch_size)) and\ + (self._current_episode >= self._start_training) + + def _check_interrupt_routine(self): + if self._interrupt_flag: + self._interrupt_flag = False + self._interrupt_training() + return True + return False + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + assert self._output_directory + assert isinstance(net, gluon.HybridBlock) + make_directory_if_not_exist(self._output_directory) + + if(episode is not None): + self._logger.info( + 'Saving model parameters after episode %d' % episode) + filename = filename + '-ep{}'.format(episode) + else: + self._logger.info('Saving model parameters') + self._save_net(net, filename) + + def _save_net(self, net, filename, filedir=None): + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename + '.params') + net.save_parameters(filename) + + def save_best_network(self, path, epoch=0): + self._logger.info( + 'Saving best network with average reward of {}'.format( + self._best_avg_score)) + self._best_net.export(path, epoch=epoch) + + def _get_session_dir(self): + session_dir = os.path.join( + self._output_directory, '.interrupted_session') + make_directory_if_not_exist(session_dir) + return session_dir + + def _save_current_as_best_net(self): + raise NotImplementedError + + def get_next_action(self, state): + raise NotImplementedError + + def save_parameters(self, episode): + raise NotImplementedError + + def train(self, episodes=None): + raise NotImplementedError + + +class DdpgAgent(Agent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + 
strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + assert critic is not None, 'Critic not set' + assert actor is not None, 'Actor is not set' + assert soft_target_update_rate > 0,\ + 'Target update must be greater zero' + assert actor_optimizer is not None, 'No actor optimizer set' + assert critic_optimizer is not None, 'No critic optimizer set' + + self._actor = actor + self._critic = critic + + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + self._actor_optimizer = actor_optimizer + self._actor_optimizer_params = self._adjust_optimizer_params( + actor_optimizer_params) + + self._critic_optimizer = critic_optimizer + self._critic_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + self._soft_target_update_rate = soft_target_update_rate + + self._logger.info( + 'Agent created with following parameters: {}'.format( + self._make_config_dict())) + + self._best_net = self._copy_actor() + + self._training_stats = DdpgTrainingStats(self._training_episodes) + + def _make_pickle_ready(self, session_dir): + super(DdpgAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._actor, 'actor', session_dir) + self._actor = None + self._save_net(self._critic, 'critic', session_dir) + self._critic = None + self._save_net(self._actor_target, 'actor_target', session_dir) + self._actor_target = None + self._save_net(self._critic_target, 'critic_target', session_dir) + self._critic_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + 
nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _save_current_as_best_net(self): + self._best_net = self._copy_actor() + + def get_next_action(self, state): + action = self._actor(nd.array([state], ctx=self._ctx)) + return action[0].asnumpy() + + def save_parameters(self, episode): + self._save_parameters(self._actor, episode=episode) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start DDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + actor_target_actions = self._actor_target(next_states) + critic_target_qvalues = self._critic_target( + next_states, actor_target_actions) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) + y = rewards + (1.0 - terminals) * self._discount_factor\ + * critic_target_qvalues + + # Train the critic network + with autograd.record(): + qvalues = self._critic(states, actions) + critic_loss = l2_loss(qvalues, y) + critic_loss.backward() + trainer_critic.step(self._minibatch_size) + + # Train the actor network + # Temporary critic so that gluon trainer does not mess + # with 
critic parameters + tmp_critic = self._copy_critic() + with autograd.record(): + actor_qvalues = tmp_critic(states, self._actor(states)) + # For maximizing qvalues we have to multiply with -1 + # as we use a minimizer + actor_loss = -1 * actor_qvalues + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss +=\ + np.sum(actor_loss.asnumpy()) / self._minibatch_size + episode_avg_q_value +=\ + np.sum(actor_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if training_steps == 0\ + else (episode_actor_loss / training_steps) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if training_steps == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + self._strategy.decay(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DdpgAgent, self)._make_config_dict() + config['soft_target_update_rate'] = self._soft_target_update_rate + config['actor_optimizer'] = self._actor_optimizer + config['actor_optimizer_params'] = self._actor_optimizer_params + config['critic_optimizer'] = self._critic_optimizer + config['critic_optimizer_params'] = self._critic_optimizer_params + return config + + def _soft_update(self, net, target, tau): + net_params = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(target.collect_params().items()): + target_params = p.data() + p.set_data((1.0 - tau) * target_params + tau * net_params[i]) + return target + + def _copy_actor(self): + assert self._actor is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + return copy_net(self._actor, self._state_dim, ctx=self._ctx) + + def _copy_critic(self): + assert self._critic is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic, self._state_dim, self._action_dim, ctx=self._ctx) + + +class DqnAgent(Agent): + def __init__( + self, + qnet, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + loss_function='euclidean', + optimizer='rmsprop', + optimizer_params={'learning_rate': 0.09}, + 
training_episodes=50, + start_training=0, + train_interval=1, + use_fix_target=False, + double_dqn=False, + target_update_interval=10, + snapshot_interval=200, + evaluation_samples=100, + agent_name='Dqn_agent', + max_episode_step=99999, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DqnAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + + self._qnet = qnet + self._target_update_interval = target_update_interval + self._target_qnet = copy_net( + self._qnet, self._state_dim, ctx=self._ctx) + self._loss_function_str = loss_function + self._loss_function = get_loss_function(loss_function) + self._optimizer = optimizer + self._optimizer_params = optimizer_params + self._double_dqn = double_dqn + self._use_fix_target = use_fix_target + + # Initialize best network + self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) + self._best_avg_score = -np.infty + + self._training_stats = None + + @classmethod + def resume_from_session(cls, session_dir, net, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['q_net_params'] = os.path.join(session_dir, 'qnet.params') + files['target_net_params'] = os.path.join( + session_dir, 'target_net.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + agent._qnet = net + agent._qnet.load_parameters(files['q_net_params'], agent._ctx) + agent._qnet.hybridize() + agent._qnet(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + agent._best_net = copy_net(agent._qnet, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + agent._target_qnet = copy_net( + agent._qnet, agent._state_dim, agent._ctx) + agent._target_qnet.load_parameters( + files['target_net_params'], agent._ctx) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = agent._logger + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _make_pickle_ready(self, session_dir): + super(DqnAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._qnet, 'qnet', session_dir) + self._qnet = None + self._save_net(self._target_qnet, 'target_net', session_dir) + self._target_qnet = None + + def get_q_values(self, state, with_best=False): + return self.get_batch_q_values( + nd.array([state], ctx=self._ctx), with_best=with_best)[0] + + def get_batch_q_values(self, state_batch, with_best=False): + return self._best_net(state_batch)\ + if with_best else self._qnet(state_batch) + + def get_next_action(self, state, with_best=False): + q_values = self.get_q_values(state, with_best=with_best) + action = 
q_values.asnumpy().argmax() + return action + + def __determine_target_q_values( + self, states, actions, rewards, next_states, terminals + ): + if self._use_fix_target: + q_max_val = self._target_qnet(next_states) + else: + q_max_val = self._qnet(next_states) + + if self._double_dqn: + q_values_next_states = self._qnet(next_states) + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_values_next_states))\ + * (1.0 - terminals) * self._discount_factor + else: + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_max_val))\ + * (1.0 - terminals) * self._discount_factor + + target_qval = self._qnet(states) + for t in range(target_rewards.shape[0]): + target_qval[t][actions[t]] = target_rewards[t] + + return target_qval + + def __train_q_net_step(self, trainer): + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + target_qval = self.__determine_target_q_values( + states, actions, rewards, next_states, terminals) + with autograd.record(): + q_values = self._qnet(states) + loss = self._loss_function(q_values, target_qval) + loss.backward() + trainer.step(self._minibatch_size) + return loss + + def __do_target_update_if_in_interval(self, total_steps): + do_target_update = ( + self._use_fix_target and + (total_steps % self._target_update_interval == 0)) + if do_target_update: + self._logger.info( + 'Target network is updated after {} steps'.format(total_steps)) + self._target_qnet = copy_net( + self._qnet, self._state_dim, self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start training ---") + trainer = gluon.Trainer( + self._qnet.collect_params(), + self._optimizer, + self._adjust_optimizer_params(self._optimizer_params)) + episodes = episodes if episodes is not None\ + else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info("Starting from episode {}".format( + self._current_episode)) + else: + self._training_stats = DqnTrainingStats(episodes) + + # Implementation Deep Q Learning described by + # Mnih et. al. in Playing Atari with Deep Reinforcement Learning + while self._current_episode < episodes: + if self._check_interrupt_routine(): + return False + + step = 0 + episode_reward = 0 + start = time.time() + state = self._environment.reset() + episode_loss = 0 + training_steps = 0 + while step < self._max_episode_step: + # 1. Choose an action based on current game state and policy + q_values = self._qnet(nd.array([state], ctx=self._ctx)) + action = self._strategy.select_action(q_values[0]) + + # 2. Play the game for a single step + next_state, reward, terminal, _ =\ + self._environment.step(action) + + # 3. Store transition in replay memory + self._memory.append( + state, action, reward, next_state, terminal) + + # 4. 
Train the network if in interval + if self._do_training(): + loss = self.__train_q_net_step(trainer) + training_steps += 1 + episode_loss +=\ + np.sum(loss.asnumpy()) / self._minibatch_size + + # Update target network if in interval + self.__do_target_update_if_in_interval(self._total_steps) + + step += 1 + self._total_steps += 1 + episode_reward += reward + state = next_state + + if terminal: + self._strategy.reset() + break + + self._do_snapshot_if_in_interval(self._current_episode) + + episode_loss = (episode_loss / training_steps)\ + if training_steps > 0 else 0 + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_loss, self._strategy.cur_eps, episode_reward) + + self._strategy.decay(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DqnAgent, self)._make_config_dict() + config['optimizer'] = self._optimizer + config['optimizer_params'] = self._optimizer_params + config['loss_function'] = self._loss_function_str + config['use_fix_target'] = self._use_fix_target + config['double_dqn'] = self._double_dqn + config['target_update_interval'] = self._target_update_interval + return config + + def save_parameters(self, episode): + self._save_parameters(self._qnet, episode=episode) + + def _save_current_as_best_net(self): + self._best_net = copy_net( + self._qnet, (1,) + self._state_dim, ctx=self._ctx) diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/cnnarch_logger.py b/src/test/resources/target_code/ddpg/reinforcement_learning/cnnarch_logger.py new file mode 100644 index 0000000..19f36b1 --- /dev/null +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/cnnarch_logger.py @@ -0,0 +1,95 @@ +import logging +import sys +import os +import util + + +class ArchLogger(object): + _logger = None + + __output_level = logging.INFO + __logger_name = 'agent' + __output_directory = '.' 
+ __append = True + __logformat = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + __dateformat = '%d-%b-%y %H:%M:%S' + + INFO = logging.INFO + DEBUG = logging.DEBUG + + @staticmethod + def set_output_level(output_level): + assert output_level is not None + ArchLogger.__output_level = output_level + + @staticmethod + def set_logger_name(logger_name): + assert logger_name is not None + ArchLogger.__logger_name = logger_name + + @staticmethod + def set_output_directory(output_directory): + assert output_directory is not None + ArchLogger.__output_directory = output_directory + + @staticmethod + def set_append(append): + assert append is not None + ArchLogger.__append = append + + @staticmethod + def set_log_format(logformat, dateformat): + assert logformat is not None + assert dateformat is not None + ArchLogger.__logformat = logformat + ArchLogger.__dateformat = dateformat + + @staticmethod + def init_logger(make_log_file=True): + assert ArchLogger._logger is None, 'Logger init already called' + filemode = 'a' if ArchLogger.__append else 'w' + formatter = logging.Formatter( + fmt=ArchLogger.__logformat, datefmt=ArchLogger.__dateformat) + + logger = logging.getLogger(ArchLogger.__logger_name) + logger.propagate = False + + if not logger.handlers: + logger.setLevel(ArchLogger.__output_level) + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setLevel(ArchLogger.__output_level) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + if make_log_file: + util.make_directory_if_not_exist(ArchLogger.__output_directory) + log_file = os.path.join( + ArchLogger.__output_directory, + ArchLogger.__logger_name + '.log') + file_handler = logging.FileHandler(log_file, mode=filemode) + file_handler.setLevel(ArchLogger.__output_level) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + ArchLogger._logger = logger + + @staticmethod + def get_logger(): + if ArchLogger._logger is None: + ArchLogger.init_logger() + assert ArchLogger._logger is not None + return ArchLogger._logger + +if __name__ == "__main__": + print('=== Test logger ===') + ArchLogger.set_logger_name('TestLogger') + ArchLogger.set_output_directory('test_log') + ArchLogger.init_logger() + logger = ArchLogger.get_logger() + logger.warning('This is a warning') + logger.debug('This is a debug information, which you should not see') + logger.info('This is a normal information') + assert os.path.exists('test_log')\ + and os.path.isfile(os.path.join('test_log', 'TestLogger.log')),\ + 'Test failed: No logfile exists' + import shutil + shutil.rmtree('test_log') \ No newline at end of file diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/environment.py b/src/test/resources/target_code/ddpg/reinforcement_learning/environment.py new file mode 100644 index 0000000..381ec9e --- /dev/null +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/environment.py @@ -0,0 +1,64 @@ +import abc +import logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class Environment: + __metaclass__ = abc.ABCMeta + + def __init__(self): + pass + + @abc.abstractmethod + def reset(self): + pass + + @abc.abstractmethod + def step(self, action): + pass + + @abc.abstractmethod + def close(self): + pass + +import gym +class GymEnvironment(Environment): + def __init__(self, env_name, **kwargs): + super(GymEnvironment, self).__init__(**kwargs) + self.__seed = 42 + self.__env = gym.make(env_name) + self.__env.seed(self.__seed) + + @property + 
def state_dim(self): + return self.__env.observation_space.shape + + + @property + def number_of_actions(self): + return self.__env.action_space.n + + @property + def rewards_dtype(self): + return 'float32' + + def reset(self): + return self.__env.reset() + + def step(self, action): + return self.__env.step(action) + + def close(self): + self.__env.close() + + def action_space(self): + self.__env.action_space + + def is_in_action_space(self, action): + return self.__env.action_space.contains(action) + + def sample_action(self): + return self.__env.action_space.sample() + + def render(self): + self.__env.render() diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/replay_memory.py b/src/test/resources/target_code/ddpg/reinforcement_learning/replay_memory.py new file mode 100644 index 0000000..d22147a --- /dev/null +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/replay_memory.py @@ -0,0 +1,208 @@ +import numpy as np + + +class ReplayMemoryBuilder(object): + def __init__(self): + self.__supported_methods = ['online', 'buffer', 'combined'] + + def build_by_params( + self, + state_dim, + method='online', + state_dtype='float32', + action_dim=(1,), + action_dtype='uint8', + rewards_dtype='float32', + memory_size=1000, + sample_size=32 + ): + assert state_dim is not None + assert action_dim is not None + assert method in self.__supported_methods + + if method == 'online': + return self.build_online_memory( + state_dim=state_dim, state_dtype=state_dtype, + action_dtype=action_dtype, action_dim=action_dim, + rewards_dtype=rewards_dtype) + else: + assert memory_size is not None and memory_size > 0 + assert sample_size is not None and sample_size > 0 + if method == 'buffer': + return self.build_buffered_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + else: + return self.build_combined_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + + def build_buffered_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): + assert memory_size > 0 + assert sample_size > 0 + return ReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_combined_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): + assert memory_size > 0 + assert sample_size > 0 + return CombinedReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_online_memory( + self, state_dim, state_dtype, action_dtype, action_dim, rewards_dtype + ): + return OnlineReplayMemory( + state_dim, state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + +class ReplayMemory(object): + def __init__( + self, + state_dim, + sample_size, + size=1000, + action_dim=(1,), + state_dtype='float32', + action_dtype='uint8', + rewards_dtype='float32' + ): + assert size > 0, "Size must be greater than zero" + assert type(state_dim) is tuple, "State dimension must be a tuple" + assert type(action_dim) is tuple, "Action 
dimension must be a tuple" + assert sample_size > 0 + self._size = size + self._sample_size = sample_size + self._cur_size = 0 + self._pointer = 0 + self._state_dim = state_dim + self._state_dtype = state_dtype + self._action_dim = action_dim + self._action_dtype = action_dtype + self._rewards_dtype = rewards_dtype + self._states = np.zeros((self._size,) + state_dim, dtype=state_dtype) + self._actions = np.zeros( + (self._size,) + action_dim, dtype=action_dtype) + self._rewards = np.array([0] * self._size, dtype=rewards_dtype) + self._next_states = np.zeros( + (self._size,) + state_dim, dtype=state_dtype) + self._terminals = np.array([0] * self._size, dtype='bool') + + @property + def sample_size(self): + return self._sample_size + + def append(self, state, action, reward, next_state, terminal): + self._states[self._pointer] = state + self._actions[self._pointer] = action + self._rewards[self._pointer] = reward + self._next_states[self._pointer] = next_state + self._terminals[self._pointer] = terminal + + self._pointer = self._pointer + 1 + if self._pointer == self._size: + self._pointer = 0 + + self._cur_size = min(self._size, self._cur_size + 1) + + def at(self, index): + return self._states[index],\ + self._actions[index],\ + self._rewards[index],\ + self._next_states[index],\ + self._terminals[index] + + def is_sample_possible(self, batch_size=None): + batch_size = batch_size if batch_size is not None\ + else self._sample_size + return self._cur_size >= batch_size + + def sample(self, batch_size=None): + batch_size = batch_size if batch_size is not None\ + else self._sample_size + assert self._cur_size >= batch_size,\ + "Size of replay memory must be larger than batch size" + i = 0 + states = np.zeros(( + batch_size,)+self._state_dim, dtype=self._state_dtype) + actions = np.zeros( + (batch_size,)+self._action_dim, dtype=self._action_dtype) + rewards = np.zeros(batch_size, dtype=self._rewards_dtype) + next_states = np.zeros( + (batch_size,)+self._state_dim, dtype=self._state_dtype) + terminals = np.zeros(batch_size, dtype='bool') + + while i < batch_size: + rnd_index = np.random.randint(low=0, high=self._cur_size) + states[i] = self._states.take(rnd_index, axis=0) + actions[i] = self._actions.take(rnd_index, axis=0) + rewards[i] = self._rewards.take(rnd_index, axis=0) + next_states[i] = self._next_states.take(rnd_index, axis=0) + terminals[i] = self._terminals.take(rnd_index, axis=0) + i += 1 + + return states, actions, rewards, next_states, terminals + + +class OnlineReplayMemory(ReplayMemory): + def __init__( + self, state_dim, state_dtype='float32', action_dim=(1,), + action_dtype='uint8', rewards_dtype='float32' + ): + super(OnlineReplayMemory, self).__init__( + state_dim, sample_size=1, size=1, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + + +class CombinedReplayMemory(ReplayMemory): + def __init__( + self, state_dim, sample_size, size=1000, state_dtype='float32', + action_dim=(1,), action_dtype='uint8', rewards_dtype='float32' + ): + super(CombinedReplayMemory, self).__init__( + state_dim=state_dim, sample_size=(sample_size - 1), size=size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + self._last_state = np.zeros((1,) + state_dim, dtype=state_dtype) + self._last_action = np.array((1,) + action_dim, dtype=action_dtype) + self._last_reward = np.array([0], dtype=rewards_dtype) + self._last_next_state = np.zeros((1,) + state_dim, dtype=state_dtype) + 
self._last_terminal = np.array([0], dtype='bool') + + def append(self, state, action, reward, next_state, terminal): + super(CombinedReplayMemory, self).append( + state, action, reward, next_state, terminal) + self._last_state = state + self._last_action = action + self._last_reward = reward + self._last_next_state = next_state + self._last_terminal = terminal + + def sample(self, batch_size=None): + batch_size = (batch_size-1) if batch_size is not None\ + else self._sample_size + states, actions, rewards, next_states, terminals = super( + CombinedReplayMemory, self).sample(batch_size=batch_size) + states = np.append(states, [self._last_state], axis=0) + actions = np.append(actions, [self._last_action], axis=0) + rewards = np.append(rewards, [self._last_reward], axis=0) + next_states = np.append(next_states, [self._last_next_state], axis=0) + terminals = np.append(terminals, [self._last_terminal], axis=0) + return states, actions, rewards, next_states, terminals \ No newline at end of file diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/strategy.py b/src/test/resources/target_code/ddpg/reinforcement_learning/strategy.py new file mode 100644 index 0000000..4c47adc --- /dev/null +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/strategy.py @@ -0,0 +1,172 @@ +import numpy as np + + +class StrategyBuilder(object): + def __init__(self): + pass + + def build_by_params( + self, + method='epsgreedy', + epsilon=0.5, + min_epsilon=0.05, + epsilon_decay_method='no', + epsilon_decay=0.0, + epsilon_decay_start=0, + action_dim=None, + action_low=None, + action_high=None, + mu=0.0, + theta=0.5, + sigma=0.3 + ): + + if epsilon_decay_method == 'linear': + decay = LinearDecay( + eps_decay=epsilon_decay, min_eps=min_epsilon, + decay_start=epsilon_decay_start) + else: + decay = NoDecay() + + if method == 'epsgreedy': + assert action_dim is not None + assert len(action_dim) == 1 + return EpsilonGreedyStrategy( + eps=epsilon, number_of_actions=action_dim[0], + decay_method=decay) + elif method == 'ornstein_uhlenbeck': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert mu is not None + assert theta is not None + assert sigma is not None + return OrnsteinUhlenbeckStrategy( + action_dim, action_low, action_high, epsilon, mu, theta, + sigma, decay) + else: + assert action_dim is not None + assert len(action_dim) == 1 + return GreedyStrategy() + + +class BaseDecay(object): + def __init__(self): + pass + + def decay(self, *args): + raise NotImplementedError + + def __call__(self, *args): + return self.decay(*args) + + +class NoDecay(BaseDecay): + def __init__(self): + super(NoDecay, self).__init__() + + def decay(self, cur_eps, episode): + return cur_eps + + +class LinearDecay(BaseDecay): + def __init__(self, eps_decay, min_eps=0, decay_start=0): + super(LinearDecay, self).__init__() + self.eps_decay = eps_decay + self.min_eps = min_eps + self.decay_start = decay_start + + def decay(self, cur_eps, episode): + if episode < self.decay_start: + return cur_eps + else: + return max(cur_eps - self.eps_decay, self.min_eps) + + +class BaseStrategy(object): + def __init__(self, decay_method): + self._decay_method = decay_method + + def select_action(self, values, decay_method): + raise NotImplementedError + + def decay(self, episode): + self.cur_eps = self._decay_method.decay(self.cur_eps, episode) + + def reset(self): + pass + + +class EpsilonGreedyStrategy(BaseStrategy): + def __init__(self, eps, number_of_actions, decay_method): + 
super(EpsilonGreedyStrategy, self).__init__(decay_method) + self.eps = eps + self.cur_eps = eps + self.__number_of_actions = number_of_actions + + def select_action(self, values): + do_exploration = (np.random.rand() < self.cur_eps) + if do_exploration: + action = np.random.randint(low=0, high=self.__number_of_actions) + else: + action = values.asnumpy().argmax() + return action + + +class GreedyStrategy(BaseStrategy): + def __init__(self): + super(GreedyStrategy, self).__init__(None) + + def select_action(self, values): + return values.asnumpy().argmax() + + def decay(self): + pass + + +class OrnsteinUhlenbeckStrategy(BaseStrategy): + """ + Ornstein-Uhlenbeck process: dxt = theta * (mu - xt) * dt + sigma * dWt + where Wt denotes the Wiener process. + """ + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + mu=0.0, + theta=.15, + sigma=.3, + decay=NoDecay() + ): + super(OrnsteinUhlenbeckStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._decay_method = decay + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._mu = np.array(mu) + self._theta = np.array(theta) + self._sigma = np.array(sigma) + + self.state = np.ones(self._action_dim) * self._mu + + def _evolve_state(self): + x = self.state + dx = self._theta * (self._mu - x)\ + + self._sigma * np.random.randn(len(x)) + self.state = x + dx + return self.state + + def reset(self): + self.state = np.ones(self._action_dim) * self._mu + + def select_action(self, values): + noise = self._evolve_state() + action = values + (self.cur_eps * noise) + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/util.py b/src/test/resources/target_code/ddpg/reinforcement_learning/util.py new file mode 100644 index 0000000..a1da1ce --- /dev/null +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/util.py @@ -0,0 +1,276 @@ +import signal +import sys +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import style +import time +import os +import mxnet as mx +from mxnet import gluon, nd +import cnnarch_logger + +LOSS_FUNCTIONS = { + 'l1': gluon.loss.L1Loss(), + 'euclidean': gluon.loss.L2Loss(), + 'huber_loss': gluon.loss.HuberLoss(), + 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), + 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} + + +def make_directory_if_not_exist(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + +def copy_net(net, input_state_dim, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2(mx.nd.ones((1,) + input_state_dim, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + + return net2 + + +def copy_net_with_two_inputs(net, input_state_dim1, input_state_dim2, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2( + nd.ones((1,) + input_state_dim1, ctx=ctx), + nd.ones((1,) + input_state_dim2, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) 
+ + return net2 + + +def get_loss_function(loss_function_name): + if loss_function_name not in LOSS_FUNCTIONS: + raise ValueError('Loss function does not exist') + return LOSS_FUNCTIONS[loss_function_name] + + +class AgentSignalHandler(object): + def __init__(self): + signal.signal(signal.SIGINT, self.interrupt_training) + self.__agent = None + self.__times_interrupted = 0 + + def register_agent(self, agent): + self.__agent = agent + + def interrupt_training(self, sig, frame): + self.__times_interrupted = self.__times_interrupted + 1 + if self.__times_interrupted <= 3: + if self.__agent: + self.__agent.set_interrupt_flag(True) + else: + print('Interrupt called three times: Force quit') + sys.exit(1) + +style.use('fivethirtyeight') + + +class TrainingStats(object): + def __init__(self, max_episodes): + self._logger = cnnarch_logger.ArchLogger.get_logger() + self._max_episodes = max_episodes + self._all_total_rewards = np.zeros((max_episodes,)) + self._all_eps = np.zeros((max_episodes,)) + self._all_time = np.zeros((max_episodes,)) + self._all_mean_reward_last_100_episodes = np.zeros((max_episodes,)) + + @property + def logger(self): + return self._logger + + @logger.setter + def logger(self, logger): + self._logger = logger + + @logger.deleter + def logger(self): + self._logger = None + + def add_total_reward(self, episode, total_reward): + self._all_total_rewards[episode] = total_reward + + def add_eps(self, episode, eps): + self._all_eps[episode] = eps + + def add_time(self, episode, time): + self._all_time[episode] = time + + def add_mean_reward_last_100(self, episode, mean_reward): + self._all_mean_reward_last_100_episodes[episode] = mean_reward + + def log_episode(self, *args): + raise NotImplementedError + + def mean_of_reward(self, cur_episode, last=100): + if cur_episode > 0: + reward_last_100 =\ + self._all_total_rewards[max(0, cur_episode-last):cur_episode] + return np.mean(reward_last_100) + else: + return self._all_total_rewards[0] + + def save(self, path): + np.save(os.path.join(path, 'total_rewards'), self._all_total_rewards) + np.save(os.path.join(path, 'eps'), self._all_eps) + np.save(os.path.join(path, 'time'), self._all_time) + np.save( + os.path.join(path, 'mean_reward'), + self._all_mean_reward_last_100_episodes) + + def _log_episode(self, episode, start_time, training_steps, eps, reward): + self.add_eps(episode, eps) + self.add_total_reward(episode, reward) + end = time.time() + mean_reward_last_100 = self.mean_of_reward(episode, last=100) + time_elapsed = end - start_time + self.add_time(episode, time_elapsed) + self.add_mean_reward_last_100(episode, mean_reward_last_100) + return ('Episode: %d, Total Reward: %.3f, ' + 'Avg. Reward Last 100 Episodes: %.3f, {}, ' + 'Time: %.3f, Training Steps: %d, Eps: %.3f') % ( + episode, reward, mean_reward_last_100, time_elapsed, + training_steps, eps), mean_reward_last_100 + + +class DqnTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DqnTrainingStats, self).__init__(max_episodes) + self._all_avg_loss = np.zeros((max_episodes,)) + + def add_avg_loss(self, episode, avg_loss): + self._all_avg_loss[episode] = avg_loss + + def log_episode( + self, episode, start_time, training_steps, avg_loss, eps, reward + ): + self.add_avg_loss(episode, avg_loss) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(('Avg. 
Loss: %.3f') % (avg_loss)) + + self._logger.info(info) + return avg_reward + + def save_stats(self, path): + fig = plt.figure(figsize=(20, 20)) + + sub_rewards = fig.add_subplot(221) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) + + sub_loss = fig.add_subplot(222) + sub_loss.set_title('Avg. Loss per episode') + sub_loss.plot(np.arange(self._max_episodes), self._all_avg_loss) + + sub_eps = fig.add_subplot(223) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) + + sub_rewards = fig.add_subplot(224) + sub_rewards.set_title('Avg. mean reward of last 100 episodes') + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) + + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) + + def save(self, path): + super(DqnTrainingStats, self).save(path) + np.save(os.path.join(path, 'avg_loss'), self._all_avg_loss) + + +class DdpgTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DdpgTrainingStats, self).__init__(max_episodes) + self._all_avg_critic_loss = np.zeros((max_episodes,)) + self._all_avg_actor_loss = np.zeros((max_episodes,)) + self._all_avg_qvalues = np.zeros((max_episodes,)) + + def add_avg_critic_loss(self, episode, avg_critic_loss): + self._all_avg_critic_loss[episode] = avg_critic_loss + + def add_avg_actor_loss(self, episode, avg_actor_loss): + self._all_avg_actor_loss[episode] = avg_actor_loss + + def add_avg_qvalues(self, episode, avg_qvalues): + self._all_avg_qvalues[episode] = avg_qvalues + + def log_episode( + self, episode, start_time, training_steps, actor_loss, + critic_loss, qvalues, eps, reward + ): + self.add_avg_actor_loss(episode, actor_loss) + self.add_avg_critic_loss(episode, critic_loss) + self.add_avg_qvalues(episode, qvalues) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(( + 'Avg. Actor Loss: %.3f ' + 'Avg. Critic Loss: %.3f ' + 'Avg. Q-Values: %.3f') % (actor_loss, critic_loss, qvalues)) + + self.logger.info(info) + return avg_reward + + def save(self, path): + super(DdpgTrainingStats, self).save(path) + np.save(os.path.join( + path, 'avg_critic_loss'), self._all_avg_critic_loss) + np.save(os.path.join(path, 'avg_actor_loss'), self._all_avg_actor_loss) + np.save(os.path.join(path, 'avg_qvalues'), self._all_avg_qvalues) + + def save_stats(self, path): + fig = plt.figure(figsize=(120, 120)) + + sub_rewards = fig.add_subplot(321) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) + + sub_actor_loss = fig.add_subplot(322) + sub_actor_loss.set_title('Avg. Actor Loss per episode') + sub_actor_loss.plot( + np.arange(self._max_episodes), self._all_avg_actor_loss) + + sub_critic_loss = fig.add_subplot(323) + sub_critic_loss.set_title('Avg. Critic Loss per episode') + sub_critic_loss.plot( + np.arange(self._max_episodes), self._all_avg_critic_loss) + + sub_qvalues = fig.add_subplot(324) + sub_qvalues.set_title('Avg. QValues per episode') + sub_qvalues.plot( + np.arange(self._max_episodes), self._all_avg_qvalues) + + sub_eps = fig.add_subplot(325) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) + + sub_rewards = fig.add_subplot(326) + sub_rewards.set_title('Avg. 
mean reward of last 100 episodes') + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) + + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) diff --git a/src/test/resources/target_code/ddpg/start_training.sh b/src/test/resources/target_code/ddpg/start_training.sh new file mode 100644 index 0000000..028a6ae --- /dev/null +++ b/src/test/resources/target_code/ddpg/start_training.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python CNNTrainer_actorNetwork.py \ No newline at end of file diff --git a/src/test/resources/target_code/reinforcement_learning/action_policy.py b/src/test/resources/target_code/reinforcement_learning/action_policy.py deleted file mode 100644 index f43a211..0000000 --- a/src/test/resources/target_code/reinforcement_learning/action_policy.py +++ /dev/null @@ -1,73 +0,0 @@ -import numpy as np - -class ActionPolicyBuilder(object): - def __init__(self): - pass - - def build_by_params(self, - method='epsgreedy', - epsilon=0.5, - min_epsilon=0.05, - epsilon_decay_method='no', - epsilon_decay=0.0, - action_dim=None): - - if epsilon_decay_method == 'linear': - decay = LinearDecay(eps_decay=epsilon_decay, min_eps=min_epsilon) - else: - decay = NoDecay() - - if method == 'epsgreedy': - assert action_dim is not None - assert len(action_dim) == 1 - return EpsilonGreedyActionPolicy(eps=epsilon, - number_of_actions=action_dim[0], decay=decay) - else: - assert action_dim is not None - assert len(action_dim) == 1 - return GreedyActionPolicy() - -class EpsilonGreedyActionPolicy(object): - def __init__(self, eps, number_of_actions, decay): - self.eps = eps - self.cur_eps = eps - self.__number_of_actions = number_of_actions - self.__decay_method = decay - - def select_action(self, values): - do_exploration = (np.random.rand() < self.cur_eps) - if do_exploration: - action = np.random.randint(low=0, high=self.__number_of_actions) - else: - action = values.asnumpy().argmax() - return action - - def decay(self): - self.cur_eps = self.__decay_method.decay(self.cur_eps) - - -class GreedyActionPolicy(object): - def __init__(self): - pass - - def select_action(self, values): - return values.asnumpy().argmax() - - def decay(self): - pass - - -class NoDecay(object): - def __init__(self): - pass - - def decay(self, cur_eps): - return cur_eps - -class LinearDecay(object): - def __init__(self, eps_decay, min_eps=0): - self.eps_decay = eps_decay - self.min_eps = min_eps - - def decay(self, cur_eps): - return max(cur_eps - self.eps_decay, self.min_eps) \ No newline at end of file diff --git a/src/test/resources/target_code/reinforcement_learning/strategy.py b/src/test/resources/target_code/reinforcement_learning/strategy.py new file mode 100644 index 0000000..ee4549d --- /dev/null +++ b/src/test/resources/target_code/reinforcement_learning/strategy.py @@ -0,0 +1,172 @@ +import numpy as np + + +class StrategyBuilder(object): + def __init__(self): + pass + + def build_by_params( + self, + method='epsgreedy', + epsilon=0.5, + min_epsilon=0.05, + epsilon_decay_method='no', + epsilon_decay=0.0, + epsilon_decay_start=0, + action_dim=None, + action_low=None, + action_high=None, + mu=0.0, + theta=0.5, + sigma=0.3 + ): + + if epsilon_decay_method == 'linear': + decay = LinearDecay( + eps_decay=epsilon_decay, min_eps=min_epsilon, + decay_start=epsilon_decay_start) + else: + decay = NoDecay() + + if method == 'epsgreedy': + assert action_dim is not None + assert len(action_dim) == 1 + return EpsilonGreedyStrategy( + eps=epsilon, number_of_actions=action_dim[0], + 
decay_method=decay) + elif method == 'ornstein_uhlenbeck': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert mu is not None + assert theta is not None + assert sigma is not None + return OrnsteinUhlenbeckStrategy( + action_dim, action_low, action_high, epsilon, mu, theta, + sigma, decay) + else: + assert action_dim is not None + assert len(action_dim) == 1 + return GreedyStrategy() + + +class BaseDecay(object): + def __init__(self): + pass + + def decay(self, *args): + raise NotImplementedError + + def __call__(self, *args): + return self.decay(*args) + + +class NoDecay(BaseDecay): + def __init__(self): + super(NoDecay, self).__init__() + + def decay(self, cur_eps, episode): + return cur_eps + + +class LinearDecay(BaseDecay): + def __init__(self, eps_decay, min_eps=0, decay_start=0): + super(LinearDecay, self).__init__() + self.eps_decay = eps_decay + self.min_eps = min_eps + self.decay_start = decay_start + + def decay(self, cur_eps, episode): + if episode < self.decay_start: + return cur_eps + else: + return max(cur_eps - self.eps_decay, self.min_eps) + + +class BaseStrategy(object): + def __init__(self, decay_method): + self._decay_method = decay_method + + def select_action(self, values, decay_method): + raise NotImplementedError + + def decay(self, episode): + self.cur_eps = self._decay_method.decay(self.cur_eps, episode) + + def reset(self): + pass + + +class EpsilonGreedyStrategy(BaseStrategy): + def __init__(self, eps, number_of_actions, decay_method): + super(EpsilonGreedyStrategy, self).__init__(decay_method) + self.eps = eps + self.cur_eps = eps + self.__number_of_actions = number_of_actions + + def select_action(self, values): + do_exploration = (np.random.rand() < self.cur_eps) + if do_exploration: + action = np.random.randint(low=0, high=self.__number_of_actions) + else: + action = values.asnumpy().argmax() + return action + + +class GreedyStrategy(BaseStrategy): + def __init__(self): + super(GreedyStrategy, self).__init__(None) + + def select_action(self, values): + return values.asnumpy().argmax() + + def decay(self): + pass + + +class OrnsteinUhlenbeckStrategy(BaseStrategy): + """ + Ornstein-Uhlenbeck process: dxt = theta * (mu - xt) * dt + sigma * dWt + where Wt denotes the Wiener process. 
+ """ + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + mu=0.0, + theta=.15, + sigma=.3, + decay=NoDecay() + ): + super(OrnsteinUhlenbeckStrategy, self).__init__() + self.eps = eps + self.cur_eps = eps + + self._decay_method = decay + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._mu = mu + self._theta = theta + self._sigma = sigma + + self.state = np.ones(self._action_dim) * self._mu + + def _evolve_state(self): + x = self.state + dx = self._theta * (self._mu - x)\ + + self._sigma * np.random.randn(len(x)) + self.state = x + dx + return self.state + + def reset(self): + self.state = np.ones(self._action_dim) * self._mu + + def select_action(self, values): + noise = self._evolve_state() + action = values + (self.cur_eps * noise) + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py b/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py new file mode 100644 index 0000000..2458ab8 --- /dev/null +++ b/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py @@ -0,0 +1,141 @@ +from reinforcement_learning.agent import DdpgAgent +from reinforcement_learning.util import AgentSignalHandler +from reinforcement_learning.cnnarch_logger import ArchLogger +from reinforcement_learning.CNNCreator_RosCriticNetwork import CNNCreator_RosCriticNetwork +import reinforcement_learning.environment +import CNNCreator_rosActorNetwork + +import os +import sys +import re +import time +import numpy as np +import mxnet as mx + + +def resume_session(sessions_dir): + resume_session = False + resume_directory = None + if os.path.isdir(sessions_dir): + regex = re.compile(r'\d\d\d\d-\d\d-\d\d-\d\d-\d\d') + dir_content = os.listdir(sessions_dir) + session_files = filter(regex.search, dir_content) + session_files.sort(reverse=True) + for d in session_files: + interrupted_session_dir = os.path.join(sessions_dir, d, '.interrupted_session') + if os.path.isdir(interrupted_session_dir): + resume = raw_input('Interrupted session from {} found. Do you want to resume? 
(y/n) '.format(d)) + if resume == 'y': + resume_session = True + resume_directory = interrupted_session_dir + break + return resume_session, resume_directory + + +if __name__ == "__main__": + agent_name = 'ddpg-agent' + # Prepare output directory and logger + all_output_dir = os.path.join('model', agent_name) + output_directory = os.path.join( + all_output_dir, + time.strftime('%Y-%m-%d-%H-%M-%S', + time.localtime(time.time()))) + ArchLogger.set_output_directory(output_directory) + ArchLogger.set_logger_name(agent_name) + ArchLogger.set_output_level(ArchLogger.INFO) + + env_params = { + 'ros_node_name': 'rosActorNetworkTrainerNode', + 'state_topic': '/environment/state', + 'action_topic': '/environment/action', + 'reset_topic': '/environment/reset', + } + env = reinforcement_learning.environment.RosEnvironment(**env_params) + + context = mx.cpu() + actor_creator = CNNCreator_rosActorNetwork.CNNCreator_rosActorNetwork() + actor_creator.construct(context) + critic_creator = CNNCreator_RosCriticNetwork() + critic_creator.construct(context) + + agent_params = { + 'environment': env, + 'replay_memory_params': { + 'method': 'buffer', + 'memory_size': 1500000, + 'sample_size': 128, + 'state_dtype': 'float32', + 'action_dtype': 'float32', + 'rewards_dtype': 'float32' + }, + 'strategy_params': { + 'method':'ornstein_uhlenbeck', + 'epsilon': 1, + 'min_epsilon': 0.001, + 'epsilon_decay_method': 'linear', + 'epsilon_decay': 0.0001, + 'epsilon_decay_start': 50, + 'action_low': -1, + 'action_high': 1, + 'mu': [0, 0.1, 0.3], + 'theta': [0.5, 0, 0.8], + 'sigma': [0.3, 0.6, -0.9], + }, + 'agent_name': agent_name, + 'verbose': True, + 'output_directory': output_directory, + 'state_dim': (8,), + 'action_dim': (3,), + 'discount_factor': 0.9998, + 'training_episodes': 2500, + 'train_interval': 1, + 'start_training': 50, + 'snapshot_interval': 500, + 'max_episode_step': 2000, + 'evaluation_samples': 1000, + 'actor': actor_creator.net, + 'critic': critic_creator.net, + 'soft_target_update_rate': 0.001, + 'actor_optimizer': 'adam', + 'actor_optimizer_params': { + 'learning_rate_minimum': 1.0E-5, + 'learning_rate': 0.001}, + 'critic_optimizer': 'rmsprop', + 'critic_optimizer_params': { + 'weight_decay': 0.01, + 'centered': True, + 'gamma2': 0.9, + 'gamma1': 0.9, + 'clip_weights': 10.0, + 'learning_rate_decay': 0.9, + 'epsilon': 1.0E-6, + 'rescale_grad': 1.1, + 'clip_gradient': 10.0, + 'learning_rate_minimum': 1.0E-5, + 'learning_rate_policy': 'step', + 'learning_rate': 0.001, + 'step_size': 1000}, + } + + resume, resume_directory = resume_session(all_output_dir) + + if resume: + output_directory, _ = os.path.split(resume_directory) + ArchLogger.set_output_directory(output_directory) + resume_agent_params = { + 'session_dir': resume_directory, + 'environment': env, + 'actor': actor_creator.net, + 'critic': critic_creator.net + } + agent = DdpgAgent.resume_from_session(**resume_agent_params) + else: + agent = DdpgAgent(**agent_params) + + signal_handler = AgentSignalHandler() + signal_handler.register_agent(agent) + + train_successful = agent.train() + + if train_successful: + agent.save_best_network(actor_creator._model_dir_ + actor_creator._model_prefix_ + '_newest', epoch=0) diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/CNNCreator_RosCriticNetwork.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/CNNCreator_RosCriticNetwork.py new file mode 100644 index 0000000..0a486a4 --- /dev/null +++ 
b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/CNNCreator_RosCriticNetwork.py @@ -0,0 +1,56 @@ +import mxnet as mx +import logging +import os +from CNNNet_RosCriticNetwork import Net + +class CNNCreator_RosCriticNetwork: + _model_dir_ = "model/RosCriticNetwork/" + _model_prefix_ = "model" + _input_shapes_ = [(8,),(3,)] + + def __init__(self): + self.weight_initializer = mx.init.Normal() + self.net = None + + def get_input_shapes(self): + return self._input_shapes_ + + def load(self, context): + lastEpoch = 0 + param_file = None + + try: + os.remove(self._model_dir_ + self._model_prefix_ + "_newest-0000.params") + except OSError: + pass + try: + os.remove(self._model_dir_ + self._model_prefix_ + "_newest-symbol.json") + except OSError: + pass + + if os.path.isdir(self._model_dir_): + for file in os.listdir(self._model_dir_): + if ".params" in file and self._model_prefix_ in file: + epochStr = file.replace(".params","").replace(self._model_prefix_ + "-","") + epoch = int(epochStr) + if epoch > lastEpoch: + lastEpoch = epoch + param_file = file + if param_file is None: + return 0 + else: + logging.info("Loading checkpoint: " + param_file) + self.net.load_parameters(self._model_dir_ + param_file) + return lastEpoch + + + def construct(self, context, data_mean=None, data_std=None): + self.net = Net(data_mean=data_mean, data_std=data_std) + self.net.collect_params().initialize(self.weight_initializer, ctx=context) + self.net.hybridize() + self.net(mx.nd.zeros((1,)+self._input_shapes_[0], ctx=context),mx.nd.zeros((1,)+self._input_shapes_[1], ctx=context)) + + if not os.path.exists(self._model_dir_): + os.makedirs(self._model_dir_) + + self.net.export(self._model_dir_ + self._model_prefix_, epoch=0) diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/CNNNet_RosCriticNetwork.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/CNNNet_RosCriticNetwork.py new file mode 100644 index 0000000..8444a87 --- /dev/null +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/CNNNet_RosCriticNetwork.py @@ -0,0 +1,119 @@ +import mxnet as mx +import numpy as np +from mxnet import gluon + +class Softmax(gluon.HybridBlock): + def __init__(self, **kwargs): + super(Softmax, self).__init__(**kwargs) + + def hybrid_forward(self, F, x): + return F.softmax(x) + + +class Split(gluon.HybridBlock): + def __init__(self, num_outputs, axis=1, **kwargs): + super(Split, self).__init__(**kwargs) + with self.name_scope(): + self.axis = axis + self.num_outputs = num_outputs + + def hybrid_forward(self, F, x): + return F.split(data=x, axis=self.axis, num_outputs=self.num_outputs) + + +class Concatenate(gluon.HybridBlock): + def __init__(self, dim=1, **kwargs): + super(Concatenate, self).__init__(**kwargs) + with self.name_scope(): + self.dim = dim + + def hybrid_forward(self, F, *x): + return F.concat(*x, dim=self.dim) + + +class ZScoreNormalization(gluon.HybridBlock): + def __init__(self, data_mean, data_std, **kwargs): + super(ZScoreNormalization, self).__init__(**kwargs) + with self.name_scope(): + self.data_mean = self.params.get('data_mean', shape=data_mean.shape, + init=mx.init.Constant(data_mean.asnumpy().tolist()), differentiable=False) + self.data_std = self.params.get('data_std', shape=data_mean.shape, + init=mx.init.Constant(data_std.asnumpy().tolist()), differentiable=False) + + def hybrid_forward(self, F, x, data_mean, data_std): + x = F.broadcast_sub(x, data_mean) + x = F.broadcast_div(x, data_std) + return x + + +class Padding(gluon.HybridBlock): 
+ def __init__(self, padding, **kwargs): + super(Padding, self).__init__(**kwargs) + with self.name_scope(): + self.pad_width = padding + + def hybrid_forward(self, F, x): + x = F.pad(data=x, + mode='constant', + pad_width=self.pad_width, + constant_value=0) + return x + + +class NoNormalization(gluon.HybridBlock): + def __init__(self, **kwargs): + super(NoNormalization, self).__init__(**kwargs) + + def hybrid_forward(self, F, x): + return x + + +class Net(gluon.HybridBlock): + def __init__(self, data_mean=None, data_std=None, **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + if not data_mean is None: + assert(not data_std is None) + self.state_input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) + else: + self.state_input_normalization = NoNormalization() + + self.fc2_1_ = gluon.nn.Dense(units=300, use_bias=True) + # fc2_1_, output shape: {[300,1,1]} + + self.relu2_1_ = gluon.nn.Activation(activation='relu') + self.fc3_1_ = gluon.nn.Dense(units=600, use_bias=True) + # fc3_1_, output shape: {[600,1,1]} + + if not data_mean is None: + assert(not data_std is None) + self.action_input_normalization = ZScoreNormalization(data_mean=data_mean, data_std=data_std) + else: + self.action_input_normalization = NoNormalization() + + self.fc2_2_ = gluon.nn.Dense(units=600, use_bias=True) + # fc2_2_, output shape: {[600,1,1]} + + self.fc4_ = gluon.nn.Dense(units=600, use_bias=True) + # fc4_, output shape: {[600,1,1]} + + self.relu4_ = gluon.nn.Activation(activation='relu') + self.fc5_ = gluon.nn.Dense(units=1, use_bias=True) + # fc5_, output shape: {[1,1,1]} + + + self.last_layer = 'linear' + + + def hybrid_forward(self, F, state, action): + state = self.state_input_normalization(state) + fc2_1_ = self.fc2_1_(state) + relu2_1_ = self.relu2_1_(fc2_1_) + fc3_1_ = self.fc3_1_(relu2_1_) + action = self.action_input_normalization(action) + fc2_2_ = self.fc2_2_(action) + add4_ = fc3_1_ + fc2_2_ + fc4_ = self.fc4_(add4_) + relu4_ = self.relu4_(fc4_) + fc5_ = self.fc5_(relu4_) + return fc5_ diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/__init__.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py new file mode 100644 index 0000000..c06b32a --- /dev/null +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py @@ -0,0 +1,900 @@ +import mxnet as mx +import numpy as np +import time +import os +import sys +import util +import matplotlib.pyplot as plt +import pyprind +from cnnarch_logger import ArchLogger +from replay_memory import ReplayMemoryBuilder +from strategy import StrategyBuilder +from util import copy_net, get_loss_function,\ + copy_net_with_two_inputs, DdpgTrainingStats, DqnTrainingStats,\ + make_directory_if_not_exist +from mxnet import nd, gluon, autograd + + +class Agent(object): + def __init__( + self, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + training_episodes=50, + train_interval=1, + start_training=0, + snapshot_interval=200, + agent_name='Agent', + max_episode_step=99999, + evaluation_samples=1000, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + assert 0 < discount_factor <= 1,\ + 'Discount factor must be between 0 and 1' + assert train_interval > 0, 
'Train interval must be greater 0' + assert snapshot_interval > 0, 'Snapshot interval must be greater 0' + assert max_episode_step > 0,\ + 'Maximal steps per episode must be greater 0' + assert training_episodes > 0, 'Trainings episode must be greater 0' + assert replay_memory_params is not None,\ + 'Replay memory parameter not set' + assert type(state_dim) is tuple, 'State dimension is not a tuple' + assert type(action_dim) is tuple, 'Action dimension is not a tuple' + + self._logger = ArchLogger.get_logger() + self._ctx = mx.gpu() if ctx == 'gpu' else mx.cpu() + self._environment = environment + self._discount_factor = discount_factor + self._training_episodes = training_episodes + self._train_interval = train_interval + self._verbose = verbose + self._state_dim = state_dim + + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = action_dim + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + self._action_dim = action_dim + + strategy_params['action_dim'] = self._action_dim + self._strategy_params = strategy_params + strategy_builder = StrategyBuilder() + self._strategy = strategy_builder.build_by_params(**strategy_params) + self._agent_name = agent_name + self._snapshot_interval = snapshot_interval + self._creation_time = time.time() + self._max_episode_step = max_episode_step + self._start_training = start_training + self._output_directory = output_directory + self._target_score = target_score + + self._evaluation_samples = evaluation_samples + self._best_avg_score = -np.infty + self._best_net = None + + self._interrupt_flag = False + self._training_stats = None + + # Training Context + self._current_episode = 0 + self._total_steps = 0 + + @property + def current_episode(self): + return self._current_episode + + @property + def environment(self): + return self._environment + + def save_config_file(self): + import json + make_directory_if_not_exist(self._output_directory) + filename = os.path.join(self._output_directory, 'config.json') + config = self._make_config_dict() + with open(filename, mode='w') as fp: + json.dump(config, fp, indent=4) + + def set_interrupt_flag(self, interrupt): + self._interrupt_flag = interrupt + + def _interrupt_training(self): + import pickle + self._logger.info('Training interrupted; Store state for resuming') + session_dir = self._get_session_dir() + agent_session_file = os.path.join(session_dir, 'agent.p') + logger = self._logger + + self._make_pickle_ready(session_dir) + + with open(agent_session_file, 'wb') as f: + pickle.dump(self, f, protocol=2) + logger.info('State successfully stored') + + def _make_pickle_ready(self, session_dir): + del self._training_stats.logger + self._logger = None + self._environment.close() + self._environment = None + self._save_net(self._best_net, 'best_net', session_dir) + self._best_net = None + + def _make_config_dict(self): + config = dict() + config['state_dim'] = self._state_dim + config['action_dim'] = self._action_dim + config['ctx'] = str(self._ctx) + config['discount_factor'] = self._discount_factor + config['strategy_params'] = self._strategy_params + config['replay_memory_params'] = self._replay_memory_params + config['training_episodes'] = self._training_episodes + config['start_training'] = self._start_training + config['evaluation_samples'] = self._evaluation_samples + config['train_interval'] = self._train_interval + 
config['snapshot_interval'] = self._snapshot_interval + config['agent_name'] = self._agent_name + config['max_episode_step'] = self._max_episode_step + config['output_directory'] = self._output_directory + config['verbose'] = self._verbose + config['target_score'] = self._target_score + return config + + def _adjust_optimizer_params(self, optimizer_params): + if 'weight_decay' in optimizer_params: + optimizer_params['wd'] = optimizer_params['weight_decay'] + del optimizer_params['weight_decay'] + if 'learning_rate_decay' in optimizer_params: + min_learning_rate = 1e-8 + if 'learning_rate_minimum' in optimizer_params: + min_learning_rate = optimizer_params['learning_rate_minimum'] + del optimizer_params['learning_rate_minimum'] + optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( + optimizer_params['step_size'], + factor=optimizer_params['learning_rate_decay'], + stop_factor_lr=min_learning_rate) + del optimizer_params['step_size'] + del optimizer_params['learning_rate_decay'] + + return optimizer_params + + def _sample_from_memory(self): + states, actions, rewards, next_states, terminals\ + = self._memory.sample(batch_size=self._minibatch_size) + states = nd.array(states, ctx=self._ctx) + actions = nd.array(actions, ctx=self._ctx) + rewards = nd.array(rewards, ctx=self._ctx) + next_states = nd.array(next_states, ctx=self._ctx) + terminals = nd.array(terminals, ctx=self._ctx) + return states, actions, rewards, next_states, terminals + + def evaluate(self, target=None, sample_games=100, verbose=True): + target = self._target_score if target is None else target + if target: + target_achieved = 0 + total_reward = 0 + + self._logger.info('Sampling from {} games...'.format(sample_games)) + for g in pyprind.prog_bar(range(sample_games)): + state = self._environment.reset() + step = 0 + game_reward = 0 + terminal = False + while not terminal and (step < self._max_episode_step): + action = self.get_next_action(state) + state, reward, terminal, _ = self._environment.step(action) + game_reward += reward + step += 1 + + if verbose: + info = 'Game %d: Reward %f' % (g, game_reward) + self._logger.debug(info) + if target: + if game_reward >= target: + target_achieved += 1 + total_reward += game_reward + + avg_reward = float(total_reward)/float(sample_games) + info = 'Avg. 
Reward: %f' % avg_reward + if target: + target_achieved_ratio = int( + (float(target_achieved)/float(sample_games))*100) + info += '; Target Achieved in %d%% of games'\ + % (target_achieved_ratio) + + if verbose: + self._logger.info(info) + return avg_reward + + def _do_snapshot_if_in_interval(self, episode): + do_snapshot =\ + (episode != 0 and (episode % self._snapshot_interval == 0)) + if do_snapshot: + self.save_parameters(episode=episode) + self._evaluate() + + def _evaluate(self, verbose=True): + avg_reward = self.evaluate( + sample_games=self._evaluation_samples, verbose=False) + info = 'Evaluation -> Average Reward in {} games: {}'.format( + self._evaluation_samples, avg_reward) + + if self._best_avg_score is None or self._best_avg_score <= avg_reward: + self._save_current_as_best_net() + self._best_avg_score = avg_reward + if verbose: + self._logger.info(info) + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _do_training(self): + return (self._total_steps % self._train_interval == 0) and\ + (self._memory.is_sample_possible(self._minibatch_size)) and\ + (self._current_episode >= self._start_training) + + def _check_interrupt_routine(self): + if self._interrupt_flag: + self._interrupt_flag = False + self._interrupt_training() + return True + return False + + def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + assert self._output_directory + assert isinstance(net, gluon.HybridBlock) + make_directory_if_not_exist(self._output_directory) + + if(episode is not None): + self._logger.info( + 'Saving model parameters after episode %d' % episode) + filename = filename + '-ep{}'.format(episode) + else: + self._logger.info('Saving model parameters') + self._save_net(net, filename) + + def _save_net(self, net, filename, filedir=None): + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename + '.params') + net.save_parameters(filename) + + def save_best_network(self, path, epoch=0): + self._logger.info( + 'Saving best network with average reward of {}'.format( + self._best_avg_score)) + self._best_net.export(path, epoch=epoch) + + def _get_session_dir(self): + session_dir = os.path.join( + self._output_directory, '.interrupted_session') + make_directory_if_not_exist(session_dir) + return session_dir + + def _save_current_as_best_net(self): + raise NotImplementedError + + def get_next_action(self, state): + raise NotImplementedError + + def save_parameters(self, episode): + raise NotImplementedError + + def train(self, episodes=None): + raise NotImplementedError + + +class DdpgAgent(Agent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, +
strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + assert critic is not None, 'Critic not set' + assert actor is not None, 'Actor is not set' + assert soft_target_update_rate > 0,\ + 'Target update must be greater zero' + assert actor_optimizer is not None, 'No actor optimizer set' + assert critic_optimizer is not None, 'No critic optimizer set' + + self._actor = actor + self._critic = critic + + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + self._actor_optimizer = actor_optimizer + self._actor_optimizer_params = self._adjust_optimizer_params( + actor_optimizer_params) + + self._critic_optimizer = critic_optimizer + self._critic_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + self._soft_target_update_rate = soft_target_update_rate + + self._logger.info( + 'Agent created with following parameters: {}'.format( + self._make_config_dict())) + + self._best_net = self._copy_actor() + + self._training_stats = DdpgTrainingStats(self._training_episodes) + + def _make_pickle_ready(self, session_dir): + super(DdpgAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._actor, 'actor', session_dir) + self._actor = None + self._save_net(self._critic, 'critic', session_dir) + self._critic = None + self._save_net(self._actor_target, 'actor_target', session_dir) + self._actor_target = None + self._save_net(self._critic_target, 'critic_target', session_dir) + self._critic_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + 
nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _save_current_as_best_net(self): + self._best_net = self._copy_actor() + + def get_next_action(self, state): + action = self._actor(nd.array([state], ctx=self._ctx)) + return action[0].asnumpy() + + def save_parameters(self, episode): + self._save_parameters(self._actor, episode=episode) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start DDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + actor_target_actions = self._actor_target(next_states) + critic_target_qvalues = self._critic_target( + next_states, actor_target_actions) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) + y = rewards + (1.0 - terminals) * self._discount_factor\ + * critic_target_qvalues + + # Train the critic network + with autograd.record(): + qvalues = self._critic(states, actions) + critic_loss = l2_loss(qvalues, y) + critic_loss.backward() + trainer_critic.step(self._minibatch_size) + + # Train the actor network + # Temporary critic so that gluon trainer does not mess + # with 
critic parameters + tmp_critic = self._copy_critic() + with autograd.record(): + actor_qvalues = tmp_critic(states, self._actor(states)) + # For maximizing qvalues we have to multiply with -1 + # as we use a minimizer + actor_loss = -1 * actor_qvalues + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss +=\ + np.sum(actor_loss.asnumpy()) / self._minibatch_size + episode_avg_q_value +=\ + np.sum(actor_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if training_steps == 0\ + else (episode_actor_loss / training_steps) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if training_steps == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + self._strategy.decay(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DdpgAgent, self)._make_config_dict() + config['soft_target_update_rate'] = self._soft_target_update_rate + config['actor_optimizer'] = self._actor_optimizer + config['actor_optimizer_params'] = self._actor_optimizer_params + config['critic_optimizer'] = self._critic_optimizer + config['critic_optimizer_params'] = self._critic_optimizer_params + return config + + def _soft_update(self, net, target, tau): + net_params = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(target.collect_params().items()): + target_params = p.data() + p.set_data((1.0 - tau) * target_params + tau * net_params[i]) + return target + + def _copy_actor(self): + assert self._actor is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + return copy_net(self._actor, self._state_dim, ctx=self._ctx) + + def _copy_critic(self): + assert self._critic is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic, self._state_dim, self._action_dim, ctx=self._ctx) + + +class DqnAgent(Agent): + def __init__( + self, + qnet, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + loss_function='euclidean', + optimizer='rmsprop', + optimizer_params={'learning_rate': 0.09}, + 
training_episodes=50, + start_training=0, + train_interval=1, + use_fix_target=False, + double_dqn=False, + target_update_interval=10, + snapshot_interval=200, + evaluation_samples=100, + agent_name='Dqn_agent', + max_episode_step=99999, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DqnAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + + self._qnet = qnet + self._target_update_interval = target_update_interval + self._target_qnet = copy_net( + self._qnet, self._state_dim, ctx=self._ctx) + self._loss_function_str = loss_function + self._loss_function = get_loss_function(loss_function) + self._optimizer = optimizer + self._optimizer_params = optimizer_params + self._double_dqn = double_dqn + self._use_fix_target = use_fix_target + + # Initialize best network + self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) + self._best_avg_score = -np.infty + + self._training_stats = None + + @classmethod + def resume_from_session(cls, session_dir, net, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['q_net_params'] = os.path.join(session_dir, 'qnet.params') + files['target_net_params'] = os.path.join( + session_dir, 'target_net.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + agent._qnet = net + agent._qnet.load_parameters(files['q_net_params'], agent._ctx) + agent._qnet.hybridize() + agent._qnet(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + agent._best_net = copy_net(agent._qnet, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + agent._target_qnet = copy_net( + agent._qnet, agent._state_dim, agent._ctx) + agent._target_qnet.load_parameters( + files['target_net_params'], agent._ctx) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = agent._logger + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _make_pickle_ready(self, session_dir): + super(DqnAgent, self)._make_pickle_ready(session_dir) + self._save_net(self._qnet, 'qnet', session_dir) + self._qnet = None + self._save_net(self._target_qnet, 'target_net', session_dir) + self._target_qnet = None + + def get_q_values(self, state, with_best=False): + return self.get_batch_q_values( + nd.array([state], ctx=self._ctx), with_best=with_best)[0] + + def get_batch_q_values(self, state_batch, with_best=False): + return self._best_net(state_batch)\ + if with_best else self._qnet(state_batch) + + def get_next_action(self, state, with_best=False): + q_values = self.get_q_values(state, with_best=with_best) + action = 
q_values.asnumpy().argmax() + return action + + def __determine_target_q_values( + self, states, actions, rewards, next_states, terminals + ): + if self._use_fix_target: + q_max_val = self._target_qnet(next_states) + else: + q_max_val = self._qnet(next_states) + + if self._double_dqn: + q_values_next_states = self._qnet(next_states) + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_values_next_states))\ + * (1.0 - terminals) * self._discount_factor + else: + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_max_val))\ + * (1.0 - terminals) * self._discount_factor + + target_qval = self._qnet(states) + for t in range(target_rewards.shape[0]): + target_qval[t][actions[t]] = target_rewards[t] + + return target_qval + + def __train_q_net_step(self, trainer): + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + target_qval = self.__determine_target_q_values( + states, actions, rewards, next_states, terminals) + with autograd.record(): + q_values = self._qnet(states) + loss = self._loss_function(q_values, target_qval) + loss.backward() + trainer.step(self._minibatch_size) + return loss + + def __do_target_update_if_in_interval(self, total_steps): + do_target_update = ( + self._use_fix_target and + (total_steps % self._target_update_interval == 0)) + if do_target_update: + self._logger.info( + 'Target network is updated after {} steps'.format(total_steps)) + self._target_qnet = copy_net( + self._qnet, self._state_dim, self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start training ---") + trainer = gluon.Trainer( + self._qnet.collect_params(), + self._optimizer, + self._adjust_optimizer_params(self._optimizer_params)) + episodes = episodes if episodes is not None\ + else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info("Starting from episode {}".format( + self._current_episode)) + else: + self._training_stats = DqnTrainingStats(episodes) + + # Implementation Deep Q Learning described by + # Mnih et. al. in Playing Atari with Deep Reinforcement Learning + while self._current_episode < episodes: + if self._check_interrupt_routine(): + return False + + step = 0 + episode_reward = 0 + start = time.time() + state = self._environment.reset() + episode_loss = 0 + training_steps = 0 + while step < self._max_episode_step: + # 1. Choose an action based on current game state and policy + q_values = self._qnet(nd.array([state], ctx=self._ctx)) + action = self._strategy.select_action(q_values[0]) + + # 2. Play the game for a single step + next_state, reward, terminal, _ =\ + self._environment.step(action) + + # 3. Store transition in replay memory + self._memory.append( + state, action, reward, next_state, terminal) + + # 4. 
Train the network if in interval + if self._do_training(): + loss = self.__train_q_net_step(trainer) + training_steps += 1 + episode_loss +=\ + np.sum(loss.asnumpy()) / self._minibatch_size + + # Update target network if in interval + self.__do_target_update_if_in_interval(self._total_steps) + + step += 1 + self._total_steps += 1 + episode_reward += reward + state = next_state + + if terminal: + self._strategy.reset() + break + + self._do_snapshot_if_in_interval(self._current_episode) + + episode_loss = (episode_loss / training_steps)\ + if training_steps > 0 else 0 + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_loss, self._strategy.cur_eps, episode_reward) + + self._strategy.decay(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.save_best_network(os.path.join(self._output_directory, 'best')) + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DqnAgent, self)._make_config_dict() + config['optimizer'] = self._optimizer + config['optimizer_params'] = self._optimizer_params + config['loss_function'] = self._loss_function_str + config['use_fix_target'] = self._use_fix_target + config['double_dqn'] = self._double_dqn + config['target_update_interval'] = self._target_update_interval + return config + + def save_parameters(self, episode): + self._save_parameters(self._qnet, episode=episode) + + def _save_current_as_best_net(self): + self._best_net = copy_net( + self._qnet, (1,) + self._state_dim, ctx=self._ctx) diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/cnnarch_logger.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/cnnarch_logger.py new file mode 100644 index 0000000..19f36b1 --- /dev/null +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/cnnarch_logger.py @@ -0,0 +1,95 @@ +import logging +import sys +import os +import util + + +class ArchLogger(object): + _logger = None + + __output_level = logging.INFO + __logger_name = 'agent' + __output_directory = '.' 
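+ # Defaults above and below are meant to be overridden through the set_* helpers before the first init_logger()/get_logger() call, e.g. (placeholder values): + # ArchLogger.set_logger_name('ddpg-agent') + # ArchLogger.set_output_directory('model_output') + # ArchLogger.init_logger(make_log_file=True) + # logger = ArchLogger.get_logger()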
+ __append = True + __logformat = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + __dateformat = '%d-%b-%y %H:%M:%S' + + INFO = logging.INFO + DEBUG = logging.DEBUG + + @staticmethod + def set_output_level(output_level): + assert output_level is not None + ArchLogger.__output_level = output_level + + @staticmethod + def set_logger_name(logger_name): + assert logger_name is not None + ArchLogger.__logger_name = logger_name + + @staticmethod + def set_output_directory(output_directory): + assert output_directory is not None + ArchLogger.__output_directory = output_directory + + @staticmethod + def set_append(append): + assert append is not None + ArchLogger.__append = append + + @staticmethod + def set_log_format(logformat, dateformat): + assert logformat is not None + assert dateformat is not None + ArchLogger.__logformat = logformat + ArchLogger.__dateformat = dateformat + + @staticmethod + def init_logger(make_log_file=True): + assert ArchLogger._logger is None, 'Logger init already called' + filemode = 'a' if ArchLogger.__append else 'w' + formatter = logging.Formatter( + fmt=ArchLogger.__logformat, datefmt=ArchLogger.__dateformat) + + logger = logging.getLogger(ArchLogger.__logger_name) + logger.propagate = False + + if not logger.handlers: + logger.setLevel(ArchLogger.__output_level) + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setLevel(ArchLogger.__output_level) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + if make_log_file: + util.make_directory_if_not_exist(ArchLogger.__output_directory) + log_file = os.path.join( + ArchLogger.__output_directory, + ArchLogger.__logger_name + '.log') + file_handler = logging.FileHandler(log_file, mode=filemode) + file_handler.setLevel(ArchLogger.__output_level) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + ArchLogger._logger = logger + + @staticmethod + def get_logger(): + if ArchLogger._logger is None: + ArchLogger.init_logger() + assert ArchLogger._logger is not None + return ArchLogger._logger + +if __name__ == "__main__": + print('=== Test logger ===') + ArchLogger.set_logger_name('TestLogger') + ArchLogger.set_output_directory('test_log') + ArchLogger.init_logger() + logger = ArchLogger.get_logger() + logger.warning('This is a warning') + logger.debug('This is a debug information, which you should not see') + logger.info('This is a normal information') + assert os.path.exists('test_log')\ + and os.path.isfile(os.path.join('test_log', 'TestLogger.log')),\ + 'Test failed: No logfile exists' + import shutil + shutil.rmtree('test_log') \ No newline at end of file diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/environment.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/environment.py new file mode 100644 index 0000000..dc90fee --- /dev/null +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/environment.py @@ -0,0 +1,149 @@ +import abc +import logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +import reward_rewardFunction_executor + +class RewardFunction(object): + def __init__(self): + self.__reward_wrapper = reward_rewardFunction_executor.reward_rewardFunction_executor() + self.__reward_wrapper.init() + + def reward(self, state, terminal): + s = state.astype('double') + t = bool(terminal) + inp = reward_rewardFunction_executor.reward_rewardFunction_input() + inp.state = s + inp.isTerminal = t + output = self.__reward_wrapper.execute(inp) + return 
output.reward + + + +class Environment: + __metaclass__ = abc.ABCMeta + + def __init__(self): + self._reward_function = RewardFunction() + + @abc.abstractmethod + def reset(self): + pass + + @abc.abstractmethod + def step(self, action): + pass + + @abc.abstractmethod + def close(self): + pass + +import rospy +import thread +import numpy as np +import time +from std_msgs.msg import Float32MultiArray, Bool, Int32, MultiArrayDimension, Float32 + +class RosEnvironment(Environment): + def __init__(self, + ros_node_name='RosTrainingAgent', + timeout_in_s=3, + state_topic='state', + action_topic='action', + reset_topic='reset', + terminal_state_topic='terminal', + reward_topic='reward'): + super(RosEnvironment, self).__init__() + self.__timeout_in_s = timeout_in_s + self.__waiting_for_state_update = False + self.__waiting_for_terminal_update = False + self.__last_received_state = 0 + self.__last_received_terminal = True + + rospy.loginfo("Initialize node {0}".format(ros_node_name)) + + self.__step_publisher = rospy.Publisher(action_topic, Float32MultiArray, queue_size=1) + rospy.loginfo('Step Publisher initialized with topic {}'.format(action_topic)) + + self.__reset_publisher = rospy.Publisher(reset_topic, Bool, queue_size=1) + rospy.loginfo('Reset Publisher initialized with topic {}'.format(reset_topic)) + + rospy.init_node(ros_node_name, anonymous=True) + + self.__state_subscriber = rospy.Subscriber(state_topic, Float32MultiArray, self.__state_callback) + rospy.loginfo('State Subscriber registered with topic {}'.format(state_topic)) + + self.__terminal_state_subscriber = rospy.Subscriber(terminal_state_topic, Bool, self.__terminal_state_callback) + rospy.loginfo('Terminal State Subscriber registered with topic {}'.format(terminal_state_topic)) + + rate = rospy.Rate(10) + + thread.start_new_thread(rospy.spin, ()) + time.sleep(2) + + def reset(self): + time.sleep(0.5) + reset_message = Bool() + reset_message.data = True + self.__waiting_for_state_update = True + self.__reset_publisher.publish(reset_message) + while self.__last_received_terminal: + self.__wait_for_new_state(self.__reset_publisher, reset_message) + return self.__last_received_state + + def step(self, action): + action_rospy = Float32MultiArray() + assert len(action.shape) == 1 + action_rospy.layout.dim.append(MultiArrayDimension()) + action_rospy.layout.dim[0].label = 'action' + action_rospy.layout.dim[0].size = 3 + action_rospy.layout.dim[0].stride = 3 + action_rospy.data = action + + logger.debug('Send action: {}'.format(action)) + + self.__waiting_for_state_update = True + self.__waiting_for_terminal_update = True + self.__step_publisher.publish(action_rospy) + self.__wait_for_new_state(self.__step_publisher, action_rospy) + next_state = self.__last_received_state + terminal = self.__last_received_terminal + reward = self.__calc_reward(next_state, terminal) + rospy.logdebug('Calculated reward: {}'.format(reward)) + + return next_state, reward, terminal, 0 + + def __wait_for_new_state(self, publisher, msg): + time_of_timeout = time.time() + self.__timeout_in_s + timeout_counter = 0 + while(self.__waiting_for_state_update + or self.__waiting_for_terminal_update): + is_timeout = (time.time() > time_of_timeout) + if (is_timeout): + if timeout_counter < 3: + rospy.logwarn("Timeout occurred: Retry message") + publisher.publish(msg) + timeout_counter += 1 + time_of_timeout = time.time() + self.__timeout_in_s + else: + rospy.logerr("Timeout 3 times in a row: Terminate application") + exit() + time.sleep(0.1)  # 100/1000 would be integer division (0) under Python 2 + + def close(self): +
rospy.signal_shutdown('Program ended!') + + def __state_callback(self, data): + self.__last_received_state = np.array(data.data, dtype='float32') + rospy.logdebug('Received state: {}'.format(self.__last_received_state)) + self.__waiting_for_state_update = False + + def __terminal_state_callback(self, data): + self.__last_received_terminal = data.data + rospy.logdebug('Received terminal flag: {}'.format(self.__last_received_terminal)) + logger.debug('Received terminal: {}'.format(self.__last_received_terminal)) + self.__waiting_for_terminal_update = False + + def __calc_reward(self, state, terminal): + # C++ Wrapper call + return self._reward_function.reward(state, terminal) diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/replay_memory.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/replay_memory.py new file mode 100644 index 0000000..d22147a --- /dev/null +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/replay_memory.py @@ -0,0 +1,208 @@ +import numpy as np + + +class ReplayMemoryBuilder(object): + def __init__(self): + self.__supported_methods = ['online', 'buffer', 'combined'] + + def build_by_params( + self, + state_dim, + method='online', + state_dtype='float32', + action_dim=(1,), + action_dtype='uint8', + rewards_dtype='float32', + memory_size=1000, + sample_size=32 + ): + assert state_dim is not None + assert action_dim is not None + assert method in self.__supported_methods + + if method == 'online': + return self.build_online_memory( + state_dim=state_dim, state_dtype=state_dtype, + action_dtype=action_dtype, action_dim=action_dim, + rewards_dtype=rewards_dtype) + else: + assert memory_size is not None and memory_size > 0 + assert sample_size is not None and sample_size > 0 + if method == 'buffer': + return self.build_buffered_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + else: + return self.build_combined_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + + def build_buffered_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): + assert memory_size > 0 + assert sample_size > 0 + return ReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_combined_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): + assert memory_size > 0 + assert sample_size > 0 + return CombinedReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_online_memory( + self, state_dim, state_dtype, action_dtype, action_dim, rewards_dtype + ): + return OnlineReplayMemory( + state_dim, state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + +class ReplayMemory(object): + def __init__( + self, + state_dim, + sample_size, + size=1000, + action_dim=(1,), + state_dtype='float32', + action_dtype='uint8', + rewards_dtype='float32' + ): + assert size > 0, "Size must be greater than zero" + assert type(state_dim) is tuple, "State 
dimension must be a tuple" + assert type(action_dim) is tuple, "Action dimension must be a tuple" + assert sample_size > 0 + self._size = size + self._sample_size = sample_size + self._cur_size = 0 + self._pointer = 0 + self._state_dim = state_dim + self._state_dtype = state_dtype + self._action_dim = action_dim + self._action_dtype = action_dtype + self._rewards_dtype = rewards_dtype + self._states = np.zeros((self._size,) + state_dim, dtype=state_dtype) + self._actions = np.zeros( + (self._size,) + action_dim, dtype=action_dtype) + self._rewards = np.array([0] * self._size, dtype=rewards_dtype) + self._next_states = np.zeros( + (self._size,) + state_dim, dtype=state_dtype) + self._terminals = np.array([0] * self._size, dtype='bool') + + @property + def sample_size(self): + return self._sample_size + + def append(self, state, action, reward, next_state, terminal): + self._states[self._pointer] = state + self._actions[self._pointer] = action + self._rewards[self._pointer] = reward + self._next_states[self._pointer] = next_state + self._terminals[self._pointer] = terminal + + self._pointer = self._pointer + 1 + if self._pointer == self._size: + self._pointer = 0 + + self._cur_size = min(self._size, self._cur_size + 1) + + def at(self, index): + return self._states[index],\ + self._actions[index],\ + self._rewards[index],\ + self._next_states[index],\ + self._terminals[index] + + def is_sample_possible(self, batch_size=None): + batch_size = batch_size if batch_size is not None\ + else self._sample_size + return self._cur_size >= batch_size + + def sample(self, batch_size=None): + batch_size = batch_size if batch_size is not None\ + else self._sample_size + assert self._cur_size >= batch_size,\ + "Size of replay memory must be larger than batch size" + i = 0 + states = np.zeros(( + batch_size,)+self._state_dim, dtype=self._state_dtype) + actions = np.zeros( + (batch_size,)+self._action_dim, dtype=self._action_dtype) + rewards = np.zeros(batch_size, dtype=self._rewards_dtype) + next_states = np.zeros( + (batch_size,)+self._state_dim, dtype=self._state_dtype) + terminals = np.zeros(batch_size, dtype='bool') + + while i < batch_size: + rnd_index = np.random.randint(low=0, high=self._cur_size) + states[i] = self._states.take(rnd_index, axis=0) + actions[i] = self._actions.take(rnd_index, axis=0) + rewards[i] = self._rewards.take(rnd_index, axis=0) + next_states[i] = self._next_states.take(rnd_index, axis=0) + terminals[i] = self._terminals.take(rnd_index, axis=0) + i += 1 + + return states, actions, rewards, next_states, terminals + + +class OnlineReplayMemory(ReplayMemory): + def __init__( + self, state_dim, state_dtype='float32', action_dim=(1,), + action_dtype='uint8', rewards_dtype='float32' + ): + super(OnlineReplayMemory, self).__init__( + state_dim, sample_size=1, size=1, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + + +class CombinedReplayMemory(ReplayMemory): + def __init__( + self, state_dim, sample_size, size=1000, state_dtype='float32', + action_dim=(1,), action_dtype='uint8', rewards_dtype='float32' + ): + super(CombinedReplayMemory, self).__init__( + state_dim=state_dim, sample_size=(sample_size - 1), size=size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + self._last_state = np.zeros((1,) + state_dim, dtype=state_dtype) + self._last_action = np.zeros((1,) + action_dim, dtype=action_dtype)  # zero-filled like _last_state; np.array((1,) + action_dim) would store the shape values themselves + self._last_reward = np.array([0], dtype=rewards_dtype) +
self._last_next_state = np.zeros((1,) + state_dim, dtype=state_dtype) + self._last_terminal = np.array([0], dtype='bool') + + def append(self, state, action, reward, next_state, terminal): + super(CombinedReplayMemory, self).append( + state, action, reward, next_state, terminal) + self._last_state = state + self._last_action = action + self._last_reward = reward + self._last_next_state = next_state + self._last_terminal = terminal + + def sample(self, batch_size=None): + batch_size = (batch_size-1) if batch_size is not None\ + else self._sample_size + states, actions, rewards, next_states, terminals = super( + CombinedReplayMemory, self).sample(batch_size=batch_size) + states = np.append(states, [self._last_state], axis=0) + actions = np.append(actions, [self._last_action], axis=0) + rewards = np.append(rewards, [self._last_reward], axis=0) + next_states = np.append(next_states, [self._last_next_state], axis=0) + terminals = np.append(terminals, [self._last_terminal], axis=0) + return states, actions, rewards, next_states, terminals \ No newline at end of file diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/strategy.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/strategy.py new file mode 100644 index 0000000..4c47adc --- /dev/null +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/strategy.py @@ -0,0 +1,172 @@ +import numpy as np + + +class StrategyBuilder(object): + def __init__(self): + pass + + def build_by_params( + self, + method='epsgreedy', + epsilon=0.5, + min_epsilon=0.05, + epsilon_decay_method='no', + epsilon_decay=0.0, + epsilon_decay_start=0, + action_dim=None, + action_low=None, + action_high=None, + mu=0.0, + theta=0.5, + sigma=0.3 + ): + + if epsilon_decay_method == 'linear': + decay = LinearDecay( + eps_decay=epsilon_decay, min_eps=min_epsilon, + decay_start=epsilon_decay_start) + else: + decay = NoDecay() + + if method == 'epsgreedy': + assert action_dim is not None + assert len(action_dim) == 1 + return EpsilonGreedyStrategy( + eps=epsilon, number_of_actions=action_dim[0], + decay_method=decay) + elif method == 'ornstein_uhlenbeck': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert mu is not None + assert theta is not None + assert sigma is not None + return OrnsteinUhlenbeckStrategy( + action_dim, action_low, action_high, epsilon, mu, theta, + sigma, decay) + else: + assert action_dim is not None + assert len(action_dim) == 1 + return GreedyStrategy() + + +class BaseDecay(object): + def __init__(self): + pass + + def decay(self, *args): + raise NotImplementedError + + def __call__(self, *args): + return self.decay(*args) + + +class NoDecay(BaseDecay): + def __init__(self): + super(NoDecay, self).__init__() + + def decay(self, cur_eps, episode): + return cur_eps + + +class LinearDecay(BaseDecay): + def __init__(self, eps_decay, min_eps=0, decay_start=0): + super(LinearDecay, self).__init__() + self.eps_decay = eps_decay + self.min_eps = min_eps + self.decay_start = decay_start + + def decay(self, cur_eps, episode): + if episode < self.decay_start: + return cur_eps + else: + return max(cur_eps - self.eps_decay, self.min_eps) + + +class BaseStrategy(object): + def __init__(self, decay_method): + self._decay_method = decay_method + + def select_action(self, values, decay_method): + raise NotImplementedError + + def decay(self, episode): + self.cur_eps = self._decay_method.decay(self.cur_eps, episode) + + def reset(self): + pass + + +class 
EpsilonGreedyStrategy(BaseStrategy): + def __init__(self, eps, number_of_actions, decay_method): + super(EpsilonGreedyStrategy, self).__init__(decay_method) + self.eps = eps + self.cur_eps = eps + self.__number_of_actions = number_of_actions + + def select_action(self, values): + do_exploration = (np.random.rand() < self.cur_eps) + if do_exploration: + action = np.random.randint(low=0, high=self.__number_of_actions) + else: + action = values.asnumpy().argmax() + return action + + +class GreedyStrategy(BaseStrategy): + def __init__(self): + super(GreedyStrategy, self).__init__(None) + + def select_action(self, values): + return values.asnumpy().argmax() + + def decay(self, episode): + # nothing to decay for a purely greedy policy; signature matches the agent's strategy.decay(episode) call + pass + + +class OrnsteinUhlenbeckStrategy(BaseStrategy): + """ + Ornstein-Uhlenbeck process: dxt = theta * (mu - xt) * dt + sigma * dWt + where Wt denotes the Wiener process. + """ + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + mu=0.0, + theta=.15, + sigma=.3, + decay=NoDecay() + ): + super(OrnsteinUhlenbeckStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._decay_method = decay + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._mu = np.array(mu) + self._theta = np.array(theta) + self._sigma = np.array(sigma) + + self.state = np.ones(self._action_dim) * self._mu + + def _evolve_state(self): + x = self.state + dx = self._theta * (self._mu - x)\ + + self._sigma * np.random.randn(len(x)) + self.state = x + dx + return self.state + + def reset(self): + self.state = np.ones(self._action_dim) * self._mu + + def select_action(self, values): + noise = self._evolve_state() + action = values + (self.cur_eps * noise) + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/util.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/util.py new file mode 100644 index 0000000..a1da1ce --- /dev/null +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/util.py @@ -0,0 +1,276 @@ +import signal +import sys +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import style +import time +import os +import mxnet as mx +from mxnet import gluon, nd +import cnnarch_logger + +LOSS_FUNCTIONS = { + 'l1': gluon.loss.L1Loss(), + 'euclidean': gluon.loss.L2Loss(), + 'huber_loss': gluon.loss.HuberLoss(), + 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), + 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} + + +def make_directory_if_not_exist(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + +def copy_net(net, input_state_dim, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2(mx.nd.ones((1,) + input_state_dim, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + + return net2 + + +def copy_net_with_two_inputs(net, input_state_dim1, input_state_dim2, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2( + nd.ones((1,) + input_state_dim1, ctx=ctx), + nd.ones((1,) + input_state_dim2, ctx=ctx)) + + params_of_net = [p.data() for _, p in
net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + + return net2 + + +def get_loss_function(loss_function_name): + if loss_function_name not in LOSS_FUNCTIONS: + raise ValueError('Loss function does not exist') + return LOSS_FUNCTIONS[loss_function_name] + + +class AgentSignalHandler(object): + def __init__(self): + signal.signal(signal.SIGINT, self.interrupt_training) + self.__agent = None + self.__times_interrupted = 0 + + def register_agent(self, agent): + self.__agent = agent + + def interrupt_training(self, sig, frame): + self.__times_interrupted = self.__times_interrupted + 1 + if self.__times_interrupted <= 3: + if self.__agent: + self.__agent.set_interrupt_flag(True) + else: + print('Interrupt called three times: Force quit') + sys.exit(1) + +style.use('fivethirtyeight') + + +class TrainingStats(object): + def __init__(self, max_episodes): + self._logger = cnnarch_logger.ArchLogger.get_logger() + self._max_episodes = max_episodes + self._all_total_rewards = np.zeros((max_episodes,)) + self._all_eps = np.zeros((max_episodes,)) + self._all_time = np.zeros((max_episodes,)) + self._all_mean_reward_last_100_episodes = np.zeros((max_episodes,)) + + @property + def logger(self): + return self._logger + + @logger.setter + def logger(self, logger): + self._logger = logger + + @logger.deleter + def logger(self): + self._logger = None + + def add_total_reward(self, episode, total_reward): + self._all_total_rewards[episode] = total_reward + + def add_eps(self, episode, eps): + self._all_eps[episode] = eps + + def add_time(self, episode, time): + self._all_time[episode] = time + + def add_mean_reward_last_100(self, episode, mean_reward): + self._all_mean_reward_last_100_episodes[episode] = mean_reward + + def log_episode(self, *args): + raise NotImplementedError + + def mean_of_reward(self, cur_episode, last=100): + if cur_episode > 0: + reward_last_100 =\ + self._all_total_rewards[max(0, cur_episode-last):cur_episode] + return np.mean(reward_last_100) + else: + return self._all_total_rewards[0] + + def save(self, path): + np.save(os.path.join(path, 'total_rewards'), self._all_total_rewards) + np.save(os.path.join(path, 'eps'), self._all_eps) + np.save(os.path.join(path, 'time'), self._all_time) + np.save( + os.path.join(path, 'mean_reward'), + self._all_mean_reward_last_100_episodes) + + def _log_episode(self, episode, start_time, training_steps, eps, reward): + self.add_eps(episode, eps) + self.add_total_reward(episode, reward) + end = time.time() + mean_reward_last_100 = self.mean_of_reward(episode, last=100) + time_elapsed = end - start_time + self.add_time(episode, time_elapsed) + self.add_mean_reward_last_100(episode, mean_reward_last_100) + return ('Episode: %d, Total Reward: %.3f, ' + 'Avg. Reward Last 100 Episodes: %.3f, {}, ' + 'Time: %.3f, Training Steps: %d, Eps: %.3f') % ( + episode, reward, mean_reward_last_100, time_elapsed, + training_steps, eps), mean_reward_last_100 + + +class DqnTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DqnTrainingStats, self).__init__(max_episodes) + self._all_avg_loss = np.zeros((max_episodes,)) + + def add_avg_loss(self, episode, avg_loss): + self._all_avg_loss[episode] = avg_loss + + def log_episode( + self, episode, start_time, training_steps, avg_loss, eps, reward + ): + self.add_avg_loss(episode, avg_loss) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(('Avg. 
Loss: %.3f') % (avg_loss)) + + self._logger.info(info) + return avg_reward + + def save_stats(self, path): + fig = plt.figure(figsize=(20, 20)) + + sub_rewards = fig.add_subplot(221) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) + + sub_loss = fig.add_subplot(222) + sub_loss.set_title('Avg. Loss per episode') + sub_loss.plot(np.arange(self._max_episodes), self._all_avg_loss) + + sub_eps = fig.add_subplot(223) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) + + sub_rewards = fig.add_subplot(224) + sub_rewards.set_title('Avg. mean reward of last 100 episodes') + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) + + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) + + def save(self, path): + super(DqnTrainingStats, self).save(path) + np.save(os.path.join(path, 'avg_loss'), self._all_avg_loss) + + +class DdpgTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DdpgTrainingStats, self).__init__(max_episodes) + self._all_avg_critic_loss = np.zeros((max_episodes,)) + self._all_avg_actor_loss = np.zeros((max_episodes,)) + self._all_avg_qvalues = np.zeros((max_episodes,)) + + def add_avg_critic_loss(self, episode, avg_critic_loss): + self._all_avg_critic_loss[episode] = avg_critic_loss + + def add_avg_actor_loss(self, episode, avg_actor_loss): + self._all_avg_actor_loss[episode] = avg_actor_loss + + def add_avg_qvalues(self, episode, avg_qvalues): + self._all_avg_qvalues[episode] = avg_qvalues + + def log_episode( + self, episode, start_time, training_steps, actor_loss, + critic_loss, qvalues, eps, reward + ): + self.add_avg_actor_loss(episode, actor_loss) + self.add_avg_critic_loss(episode, critic_loss) + self.add_avg_qvalues(episode, qvalues) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(( + 'Avg. Actor Loss: %.3f ' + 'Avg. Critic Loss: %.3f ' + 'Avg. Q-Values: %.3f') % (actor_loss, critic_loss, qvalues)) + + self.logger.info(info) + return avg_reward + + def save(self, path): + super(DdpgTrainingStats, self).save(path) + np.save(os.path.join( + path, 'avg_critic_loss'), self._all_avg_critic_loss) + np.save(os.path.join(path, 'avg_actor_loss'), self._all_avg_actor_loss) + np.save(os.path.join(path, 'avg_qvalues'), self._all_avg_qvalues) + + def save_stats(self, path): + fig = plt.figure(figsize=(120, 120)) + + sub_rewards = fig.add_subplot(321) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(self._max_episodes), self._all_total_rewards) + + sub_actor_loss = fig.add_subplot(322) + sub_actor_loss.set_title('Avg. Actor Loss per episode') + sub_actor_loss.plot( + np.arange(self._max_episodes), self._all_avg_actor_loss) + + sub_critic_loss = fig.add_subplot(323) + sub_critic_loss.set_title('Avg. Critic Loss per episode') + sub_critic_loss.plot( + np.arange(self._max_episodes), self._all_avg_critic_loss) + + sub_qvalues = fig.add_subplot(324) + sub_qvalues.set_title('Avg. QValues per episode') + sub_qvalues.plot( + np.arange(self._max_episodes), self._all_avg_qvalues) + + sub_eps = fig.add_subplot(325) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(self._max_episodes), self._all_eps) + + sub_rewards = fig.add_subplot(326) + sub_rewards.set_title('Avg. 
mean reward of last 100 episodes') + sub_rewards.plot(np.arange(self._max_episodes), + self._all_mean_reward_last_100_episodes) + + self.save(path) + plt.savefig(os.path.join(path, 'stats.pdf')) diff --git a/src/test/resources/target_code/ros-ddpg/start_training.sh b/src/test/resources/target_code/ros-ddpg/start_training.sh new file mode 100644 index 0000000..6940edb --- /dev/null +++ b/src/test/resources/target_code/ros-ddpg/start_training.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python CNNTrainer_rosActorNetwork.py \ No newline at end of file diff --git a/src/test/resources/valid_tests/ReinforcementConfig1.cnnt b/src/test/resources/valid_tests/ReinforcementConfig1.cnnt index 101d988..86d471c 100644 --- a/src/test/resources/valid_tests/ReinforcementConfig1.cnnt +++ b/src/test/resources/valid_tests/ReinforcementConfig1.cnnt @@ -31,7 +31,7 @@ configuration ReinforcementConfig1 { sample_size : 64 } - action_selection : epsgreedy{ + strategy : epsgreedy{ epsilon : 1.0 min_epsilon : 0.02 epsilon_decay_method: linear diff --git a/src/test/resources/valid_tests/ReinforcementConfig2.cnnt b/src/test/resources/valid_tests/ReinforcementConfig2.cnnt index 61deb55..8bb853d 100644 --- a/src/test/resources/valid_tests/ReinforcementConfig2.cnnt +++ b/src/test/resources/valid_tests/ReinforcementConfig2.cnnt @@ -15,6 +15,8 @@ configuration ReinforcementConfig2 { snapshot_interval : 20 + evaluation_samples: 100 + use_double_dqn : false loss : euclidean @@ -24,10 +26,11 @@ configuration ReinforcementConfig2 { sample_size : 32 } - action_selection : epsgreedy{ + strategy : epsgreedy{ epsilon : 1.0 min_epsilon : 0.01 epsilon_decay_method: linear + epsilon_decay_start: 20 epsilon_decay : 0.0001 } diff --git a/src/test/resources/valid_tests/ReinforcementConfig3.cnnt b/src/test/resources/valid_tests/ReinforcementConfig3.cnnt new file mode 100644 index 0000000..8d2bed6 --- /dev/null +++ b/src/test/resources/valid_tests/ReinforcementConfig3.cnnt @@ -0,0 +1,44 @@ +configuration ReinforcementConfig3 { + learning_method : reinforcement + + environment : ros_interface { + state_topic : "/environment/state" + action_topic : "/environment/action" + reset_topic : "/environment/reset" + reward_topic: "/environment/reward" + } + + agent_name : "reinforcement_agent" + + num_episodes : 1000 + target_score : 35000 + discount_factor : 0.99999 + num_max_steps : 10000 + training_interval : 1 + + use_fix_target_network : true + target_network_update_interval : 500 + + snapshot_interval : 500 + + use_double_dqn : true + + loss : huber_loss + + replay_memory : buffer{ + memory_size : 1000000 + sample_size : 64 + } + + strategy : epsgreedy{ + epsilon : 1.0 + min_epsilon : 0.02 + epsilon_decay_method: linear + epsilon_decay : 0.0001 + } + + optimizer : adam{ + learning_rate : 0.001 + } + +} \ No newline at end of file diff --git a/src/test/resources/valid_tests/ddpg-ros/RosActorNetwork.cnnt b/src/test/resources/valid_tests/ddpg-ros/RosActorNetwork.cnnt new file mode 100644 index 0000000..074f312 --- /dev/null +++ b/src/test/resources/valid_tests/ddpg-ros/RosActorNetwork.cnnt @@ -0,0 +1,66 @@ +configuration RosActorNetwork { + learning_method : reinforcement + + rl_algorithm : ddpg-algorithm + + critic : comp.rosCriticNetwork + + environment : ros_interface { + state_topic : "/environment/state" + action_topic : "/environment/action" + reset_topic : "/environment/reset" + } + + reward_function : reward.rewardFunction + + agent_name : "ddpg-agent" + + num_episodes : 2500 + discount_factor : 0.9998 + num_max_steps : 2000 + training_interval : 1 
+ + start_training_at: 50 + + soft_target_update_rate: 0.001 + + snapshot_interval: 500 + evaluation_samples: 1000 + + replay_memory : buffer{ + memory_size : 1500000 + sample_size : 128 + } + + strategy : ornstein_uhlenbeck{ + epsilon: 1.0 + min_epsilon: 0.001 + epsilon_decay_method: linear + epsilon_decay : 0.0001 + epsilon_decay_start: 50 + mu: (0.0, 0.1, 0.3) + theta: (0.5, 0.0, 0.8) + sigma: (0.3, 0.6, -0.9) + } + + actor_optimizer : adam{ + learning_rate : 0.001 + learning_rate_minimum : 0.00001 + } + + critic_optimizer : rmsprop{ + learning_rate : 0.001 + learning_rate_minimum : 0.00001 + weight_decay : 0.01 + learning_rate_decay : 0.9 + learning_rate_policy : step + step_size : 1000 + rescale_grad : 1.1 + clip_gradient : 10 + gamma1 : 0.9 + gamma2 : 0.9 + epsilon : 0.000001 + centered : true + clip_weights : 10 + } +} \ No newline at end of file diff --git a/src/test/resources/valid_tests/ddpg-ros/comp/RosCriticNetwork.cnna b/src/test/resources/valid_tests/ddpg-ros/comp/RosCriticNetwork.cnna new file mode 100644 index 0000000..8c9363e --- /dev/null +++ b/src/test/resources/valid_tests/ddpg-ros/comp/RosCriticNetwork.cnna @@ -0,0 +1,14 @@ +implementation Critic(state, action) { + (state -> + FullyConnected(units=300) -> + Relu() -> + FullyConnected(units=600) + | + + action -> + FullyConnected(units=600) + ) -> + Add() -> + FullyConnected(units=600) -> + Relu() +} \ No newline at end of file diff --git a/src/test/resources/valid_tests/ddpg-ros/reward/RewardFunction.emadl b/src/test/resources/valid_tests/ddpg-ros/reward/RewardFunction.emadl new file mode 100644 index 0000000..0e9b97d --- /dev/null +++ b/src/test/resources/valid_tests/ddpg-ros/reward/RewardFunction.emadl @@ -0,0 +1,15 @@ +package reward; + +component RewardFunction { + ports + in Q^{16} state, + in B isTerminal, + out Q reward; + + implementation Math { + Q speed = state(15); + Q angle = state(1); + + reward = speed * cos(angle); + } +} \ No newline at end of file diff --git a/src/test/resources/valid_tests/ddpg/ActorNetwork.cnnt b/src/test/resources/valid_tests/ddpg/ActorNetwork.cnnt new file mode 100644 index 0000000..85cf93e --- /dev/null +++ b/src/test/resources/valid_tests/ddpg/ActorNetwork.cnnt @@ -0,0 +1,61 @@ +configuration ActorNetwork { + learning_method : reinforcement + + rl_algorithm : ddpg-algorithm + + critic : comp.criticNetwork + + environment : gym { name : "CartPole-v0" } + + agent_name : "ddpg-agent" + + num_episodes : 1000 + discount_factor : 0.99999 + target_score : 185.5 + num_max_steps : 250 + training_interval : 1 + + start_training_at: 100 + + soft_target_update_rate: 0.001 + + snapshot_interval: 50 + evaluation_samples: 100 + + replay_memory : buffer{ + memory_size : 1000000 + sample_size : 64 + } + + strategy : ornstein_uhlenbeck{ + epsilon: 1.0 + min_epsilon: 0.001 + epsilon_decay_method: linear + epsilon_decay : 0.0001 + epsilon_decay_start: 50 + mu: (0.0, 0.1, 0.3) + theta: (0.5, 0.0, 0.8) + sigma: (0.3, 0.6, -0.9) + } + + actor_optimizer : adam{ + learning_rate : 0.001 + learning_rate_minimum : 0.00001 + } + + critic_optimizer : rmsprop{ + learning_rate : 0.001 + learning_rate_minimum : 0.00001 + weight_decay : 0.01 + learning_rate_decay : 0.9 + learning_rate_policy : step + step_size : 1000 + rescale_grad : 1.1 + clip_gradient : 10 + gamma1 : 0.9 + gamma2 : 0.9 + epsilon : 0.000001 + centered : true + clip_weights : 10 + } +} \ No newline at end of file diff --git a/src/test/resources/valid_tests/ddpg/comp/CriticNetwork.cnna b/src/test/resources/valid_tests/ddpg/comp/CriticNetwork.cnna 
new file mode 100644 index 0000000..8c9363e --- /dev/null +++ b/src/test/resources/valid_tests/ddpg/comp/CriticNetwork.cnna @@ -0,0 +1,14 @@ +implementation Critic(state, action) { + (state -> + FullyConnected(units=300) -> + Relu() -> + FullyConnected(units=600) + | + + action -> + FullyConnected(units=600) + ) -> + Add() -> + FullyConnected(units=600) -> + Relu() +} \ No newline at end of file -- GitLab
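
For readers less familiar with the CNNArch notation, the Critic(state, action) implementation in CriticNetwork.cnna above can be read roughly as the plain Gluon block below. This is only an illustrative sketch: the class name CriticSketch and the mapping of FullyConnected/Relu/Add onto Dense layers plus an element-wise add are assumptions of this note, and the critic code the generator actually emits may wire the branches differently. Merging the action branch only after the first state layer mirrors the critic layout commonly used for DDPG.

from mxnet import gluon


class CriticSketch(gluon.HybridBlock):
    """Illustrative Gluon counterpart of Critic(state, action) above."""

    def __init__(self, **kwargs):
        super(CriticSketch, self).__init__(**kwargs)
        with self.name_scope():
            # state branch: FullyConnected(units=300) -> Relu() -> FullyConnected(units=600)
            self.state_fc1 = gluon.nn.Dense(300, activation='relu')
            self.state_fc2 = gluon.nn.Dense(600)
            # action branch: FullyConnected(units=600)
            self.action_fc = gluon.nn.Dense(600)
            # joint part after Add(): FullyConnected(units=600) -> Relu()
            self.joint_fc = gluon.nn.Dense(600, activation='relu')

    def hybrid_forward(self, F, state, action):
        state_out = self.state_fc2(self.state_fc1(state))
        action_out = self.action_fc(action)
        # Add() merges the two branches element-wise before the joint head
        return self.joint_fc(state_out + action_out)

# usage: net = CriticSketch(); net.initialize()
#        q_values = net(state_batch, action_batch)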
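
The ornstein_uhlenbeck strategy blocks in ActorNetwork.cnnt and RosActorNetwork.cnnt configure an Ornstein-Uhlenbeck exploration noise process over the action dimensions. The sketch below shows one common reading of those parameters, with mu, theta and sigma as per-dimension vectors and epsilon as a linearly decayed scale on the noise added to the actor output; the class and method names are illustrative only, and the generated strategy code may differ in detail.

import numpy as np


class OrnsteinUhlenbeckSketch(object):
    """Illustrative reading of the ornstein_uhlenbeck strategy parameters."""

    def __init__(self, mu, theta, sigma,
                 epsilon=1.0, min_epsilon=0.001, epsilon_decay=0.0001):
        # mu, theta and sigma are per-action-dimension vectors,
        # e.g. mu: (0.0, 0.1, 0.3) in ActorNetwork.cnnt
        self._mu = np.array(mu, dtype='float64')
        self._theta = np.array(theta, dtype='float64')
        self._sigma = np.array(sigma, dtype='float64')
        self._epsilon = epsilon
        self._min_epsilon = min_epsilon
        self._epsilon_decay = epsilon_decay
        self._state = np.copy(self._mu)

    def reset(self):
        # restart the noise process at mu for a new episode
        self._state = np.copy(self._mu)

    def select_action(self, values):
        # one Euler step of dx = theta * (mu - x) + sigma * N(0, 1); the
        # decayed epsilon scales the noise added to the actor output `values`
        dx = self._theta * (self._mu - self._state) \
            + self._sigma * np.random.randn(len(self._mu))
        self._state = self._state + dx
        return values + self._epsilon * self._state

    def decay(self):
        # linear decay, as selected by epsilon_decay_method: linear
        self._epsilon = max(self._min_epsilon,
                            self._epsilon - self._epsilon_decay)

Under this reading, theta pulls the noise back towards mu each step while sigma controls the per-step randomness; a negative sigma entry, as in sigma: (0.3, 0.6, -0.9), only flips the sign of a symmetric Gaussian term and so behaves like its absolute value.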
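
The DdpgTrainingStats class defined in the utility module above is meant to be fed once per episode by the agent and flushed at the end of training. A minimal usage sketch, assuming the module is importable as reinforcement_learning.util (the import path is an assumption here), that cnnarch_logger.ArchLogger has been initialised, and with placeholder episode statistics standing in for a real rollout:

import time

from reinforcement_learning.util import DdpgTrainingStats

stats = DdpgTrainingStats(max_episodes=10)
for episode in range(10):
    start = time.time()
    # placeholder per-episode values in place of actual training results
    training_steps, eps, total_reward = 250, 0.5, 100.0
    avg_actor_loss, avg_critic_loss, avg_qvalues = -1.2, 0.05, 30.0
    stats.log_episode(episode, start, training_steps, avg_actor_loss,
                      avg_critic_loss, avg_qvalues, eps, total_reward)
# writes the .npy arrays and stats.pdf into the given directory
stats.save_stats('.')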