diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 575941b3070abd976c1470ec6e41e6523befbfb6..a45cd7c87301d8847cb1237464de25df85737f9b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -27,7 +27,7 @@ masterJobLinux:
   stage: linux
   image: maven:3-jdk-8
   script:
-  - mvn -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -B clean deploy --settings settings.xml -Dtest=\!Integration*
+  - mvn -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -B clean deploy --settings settings.xml
   - cat target/site/jacoco/index.html
   - mvn package sonar:sonar -s settings.xml
   only:
@@ -36,7 +36,7 @@ masterJobLinux:
 masterJobWindows:
   stage: windows
   script:
-  - mvn -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -B clean install --settings settings.xml -Dtest=\!Integration*
+  - mvn -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -B clean install --settings settings.xml
   tags:
   - Windows10
 
@@ -44,13 +44,7 @@ BranchJobLinux:
   stage: linux
   image: maven:3-jdk-8
   script:
-  - mvn -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -B clean install --settings settings.xml -Dtest=\!Integration*
+  - mvn -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -B clean install --settings settings.xml
   - cat target/site/jacoco/index.html
   except:
   - master
-
-PythonWrapperIntegrationTest:
-  stage: linux
-  image: registry.git.rwth-aachen.de/monticore/embeddedmontiarc/generators/emadl2pythonwrapper/tests/mvn-swig:latest
-  script:
-  - mvn -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -B clean install --settings settings.xml -Dtest=IntegrationPythonWrapperTest
diff --git a/pom.xml b/pom.xml
index 6ad0f60f7f5e23ab07f560000231d89d2bfd1a80..677cd81aa359354a9d4eb2790e514be367ae5761 100644
--- a/pom.xml
+++ b/pom.xml
@@ -8,7 +8,7 @@
 
   <groupId>de.monticore.lang.monticar</groupId>
   <artifactId>cnnarch-gluon-generator</artifactId>
-  <version>0.2.2-SNAPSHOT</version>
+  <version>0.2.6-SNAPSHOT</version>
 
   <!-- == PROJECT DEPENDENCIES ============================================= -->
 
@@ -16,10 +16,10 @@
 
   <!-- .. SE-Libraries .................................................. -->
   <CNNArch.version>0.3.1-SNAPSHOT</CNNArch.version>
-  <CNNTrain.version>0.3.4-SNAPSHOT</CNNTrain.version>
+  <CNNTrain.version>0.3.6-SNAPSHOT</CNNTrain.version>
   <CNNArch2X.version>0.0.2-SNAPSHOT</CNNArch2X.version>
   <embedded-montiarc-math-opt-generator>0.1.4</embedded-montiarc-math-opt-generator>
-  <EMADL2PythonWrapper.version>0.0.1</EMADL2PythonWrapper.version>
+  <EMADL2PythonWrapper.version>0.0.2-SNAPSHOT</EMADL2PythonWrapper.version>
 
   <!-- .. Libraries .................................................. -->
   <guava.version>18.0</guava.version>
diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNArch2GluonCli.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNArch2GluonCli.java
index d16e68cd28e7b04f77fff79a06f67da9ca929531..b16a14766cb32edc42a4a7dbc9c69747d98094dd 100644
--- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNArch2GluonCli.java
+++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNArch2GluonCli.java
@@ -29,4 +29,4 @@ public class CNNArch2GluonCli {
         GenericCNNArchCli cli = new GenericCNNArchCli(generator);
         cli.run(args);
     }
-}
+}
\ No newline at end of file
diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNTrain2Gluon.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNTrain2Gluon.java
index d460b609f785f09b27952da5a0af71e726442970..e358c2c357814ddb3277d5a65109b706c022f2eb 100644
--- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNTrain2Gluon.java
+++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/CNNTrain2Gluon.java
@@ -1,8 +1,9 @@
 package de.monticore.lang.monticar.cnnarch.gluongenerator;
 
 import com.google.common.collect.Maps;
-import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.critic.CriticNetworkGenerationPair;
-import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.critic.CriticNetworkGenerator;
+import de.monticore.lang.embeddedmontiarc.embeddedmontiarc._symboltable.instanceStructure.EMAComponentInstanceSymbol;
+import de.monticore.lang.monticar.cnnarch._symboltable.ArchitectureSymbol;
+import de.monticore.lang.monticar.cnnarch.gluongenerator.annotations.ArchitectureAdapter;
 import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.FunctionParameterChecker;
 import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.RewardFunctionParameterAdapter;
 import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.RewardFunctionSourceGenerator;
@@ -10,15 +11,12 @@
 import de.monticore.lang.monticar.cnnarch.generator.ConfigurationData;
 import de.monticore.lang.monticar.cnnarch.generator.CNNTrainGenerator;
 import de.monticore.lang.monticar.cnnarch.generator.TemplateConfiguration;
-import de.monticore.lang.monticar.cnntrain._symboltable.ConfigurationSymbol;
-import de.monticore.lang.monticar.cnntrain._symboltable.LearningMethod;
-import de.monticore.lang.monticar.cnntrain._symboltable.RLAlgorithm;
-import de.monticore.lang.monticar.cnntrain._symboltable.RewardFunctionSymbol;
-import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture;
+import de.monticore.lang.monticar.cnntrain._symboltable.*;
 import de.monticore.lang.monticar.generator.FileContent;
 import de.monticore.lang.monticar.generator.cpp.GeneratorCPP;
 import de.monticore.lang.monticar.generator.pythonwrapper.GeneratorPythonWrapperStandaloneApi;
 import de.monticore.lang.monticar.generator.pythonwrapper.symbolservices.data.ComponentPortInformation;
+import de.monticore.lang.tagging._symboltable.TaggingResolver;
 import de.se_rwth.commons.logging.Log;
 
 import java.io.File;
@@ -54,13 +52,6 @@ public class CNNTrain2Gluon extends CNNTrainGenerator {
     @Override
     public ConfigurationSymbol getConfigurationSymbol(Path modelsDirPath, String rootModelName) {
         ConfigurationSymbol configurationSymbol = super.getConfigurationSymbol(modelsDirPath, rootModelName);
-
-        // Generate Reward function if necessary
-        if
(configurationSymbol.getLearningMethod().equals(LearningMethod.REINFORCEMENT) - && configurationSymbol.getRlRewardFunction().isPresent()) { - generateRewardFunction(configurationSymbol.getRlRewardFunction().get(), modelsDirPath); - } - return configurationSymbol; } @@ -93,17 +84,25 @@ public class CNNTrain2Gluon extends CNNTrainGenerator { } } - public void generate(Path modelsDirPath, String rootModelName, TrainedArchitecture trainedArchitecture) { + public void generate(Path modelsDirPath, + String rootModelName, + NNArchitectureSymbol trainedArchitecture, + NNArchitectureSymbol criticNetwork) { ConfigurationSymbol configurationSymbol = this.getConfigurationSymbol(modelsDirPath, rootModelName); configurationSymbol.setTrainedArchitecture(trainedArchitecture); + configurationSymbol.setCriticNetwork(criticNetwork); this.setRootProjectModelsDir(modelsDirPath.toString()); generateFilesFromConfigurationSymbol(configurationSymbol); } + public void generate(Path modelsDirPath, String rootModelName, NNArchitectureSymbol trainedArchitecture) { + generate(modelsDirPath, rootModelName, trainedArchitecture, null); + } + @Override public Map<String, String> generateStrings(ConfigurationSymbol configuration) { TemplateConfiguration templateConfiguration = new GluonTemplateConfiguration(); - ReinforcementConfigurationData configData = new ReinforcementConfigurationData(configuration, getInstanceName()); + GluonConfigurationData configData = new GluonConfigurationData(configuration, getInstanceName()); List<ConfigurationData> configDataList = new ArrayList<>(); configDataList.add(configData); @@ -119,24 +118,39 @@ public class CNNTrain2Gluon extends CNNTrainGenerator { final String trainerName = "CNNTrainer_" + getInstanceName(); final RLAlgorithm rlAlgorithm = configData.getRlAlgorithm(); - if (rlAlgorithm.equals(RLAlgorithm.DDPG)) { - CriticNetworkGenerator criticNetworkGenerator = new CriticNetworkGenerator(); - criticNetworkGenerator.setGenerationTargetPath( - Paths.get(getGenerationTargetPath(), REINFORCEMENT_LEARNING_FRAMEWORK_MODULE).toString()); - if (getRootProjectModelsDir().isPresent()) { - criticNetworkGenerator.setRootModelsDir(getRootProjectModelsDir().get()); - } else { - Log.error("No root model dir set"); + if (rlAlgorithm.equals(RLAlgorithm.DDPG) + || rlAlgorithm.equals(RLAlgorithm.TD3)) { + + if (!configuration.getCriticNetwork().isPresent()) { + Log.error("No architecture model for critic available but is required for chosen " + + "actor-critic algorithm"); } + NNArchitectureSymbol genericArchitectureSymbol = configuration.getCriticNetwork().get(); + ArchitectureSymbol architectureSymbol + = ((ArchitectureAdapter)genericArchitectureSymbol).getArchitectureSymbol(); - CriticNetworkGenerationPair criticNetworkResult - = criticNetworkGenerator.generateCriticNetworkContent(templateConfiguration, configuration); + CNNArch2Gluon gluonGenerator = new CNNArch2Gluon(); + gluonGenerator.setGenerationTargetPath( + Paths.get(getGenerationTargetPath(), REINFORCEMENT_LEARNING_FRAMEWORK_MODULE).toString()); + Map<String, String> architectureFileContentMap + = gluonGenerator.generateStringsAllowMultipleIO(architectureSymbol, true); - fileContentMap.putAll(criticNetworkResult.getFileContent().entrySet().stream().collect(Collectors.toMap( + + final String creatorName = architectureFileContentMap.keySet().iterator().next(); + final String criticInstanceName = creatorName.substring( + creatorName.indexOf('_') + 1, creatorName.lastIndexOf(".py")); + + 
fileContentMap.putAll(architectureFileContentMap.entrySet().stream().collect(Collectors.toMap( k -> REINFORCEMENT_LEARNING_FRAMEWORK_MODULE + "/" + k.getKey(), Map.Entry::getValue)) ); - ftlContext.put("criticInstanceName", criticNetworkResult.getCriticNetworkName()); + + ftlContext.put("criticInstanceName", criticInstanceName); + } + + // Generate Reward function if necessary + if (configuration.getRlRewardFunction().isPresent()) { + generateRewardFunction(configuration.getRlRewardFunction().get(), Paths.get(rootProjectModelsDir)); } ftlContext.put("trainerName", trainerName); @@ -164,8 +178,11 @@ public class CNNTrain2Gluon extends CNNTrainGenerator { setRootProjectModelsDir(modelsDirPath.toString()); } - rewardFunctionSourceGenerator.generate(getRootProjectModelsDir().get(), - rewardFunctionRootModel, rewardFunctionOutputPath); + final TaggingResolver taggingResolver + = rewardFunctionSourceGenerator.createTaggingResolver(getRootProjectModelsDir().get()); + final EMAComponentInstanceSymbol emaSymbol + = rewardFunctionSourceGenerator.resolveSymbol(taggingResolver, rewardFunctionRootModel); + rewardFunctionSourceGenerator.generate(emaSymbol, taggingResolver, rewardFunctionOutputPath); fixArmadilloEmamGenerationOfFile(Paths.get(rewardFunctionOutputPath, String.join("_", fullNameOfComponent) + ".h")); String pythonWrapperOutputPath = Paths.get(rewardFunctionOutputPath, "pylib").toString(); @@ -175,12 +192,11 @@ public class CNNTrain2Gluon extends CNNTrainGenerator { if (pythonWrapperApi.checkIfPythonModuleBuildAvailable()) { final String rewardModuleOutput = Paths.get(getGenerationTargetPath(), REINFORCEMENT_LEARNING_FRAMEWORK_MODULE).toString(); - componentPortInformation = pythonWrapperApi.generateAndTryBuilding(getRootProjectModelsDir().get(), - rewardFunctionRootModel, pythonWrapperOutputPath, rewardModuleOutput); + componentPortInformation = pythonWrapperApi.generateAndTryBuilding(emaSymbol, + pythonWrapperOutputPath, rewardModuleOutput); } else { Log.warn("Cannot build wrapper automatically: OS not supported. 
Please build manually before starting training."); - componentPortInformation = pythonWrapperApi.generate(getRootProjectModelsDir().get(), rewardFunctionRootModel, - pythonWrapperOutputPath); + componentPortInformation = pythonWrapperApi.generate(emaSymbol, pythonWrapperOutputPath); } RewardFunctionParameterAdapter functionParameter = new RewardFunctionParameterAdapter(componentPortInformation); new FunctionParameterChecker().check(functionParameter); diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/ReinforcementConfigurationData.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GluonConfigurationData.java similarity index 61% rename from src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/ReinforcementConfigurationData.java rename to src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GluonConfigurationData.java index 3b2c2847ae5e7a99bb29221c3e7c72fb45c4de70..a40a38f527c5fd6fea29e44f5dc96100d2be3f02 100644 --- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/ReinforcementConfigurationData.java +++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GluonConfigurationData.java @@ -1,138 +1,125 @@ package de.monticore.lang.monticar.cnnarch.gluongenerator; -import de.monticore.lang.monticar.cnnarch._symboltable.ArchitectureSymbol; import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.RewardFunctionParameterAdapter; import de.monticore.lang.monticar.cnnarch.generator.ConfigurationData; import de.monticore.lang.monticar.cnntrain._symboltable.*; import de.monticore.lang.monticar.cnntrain.annotations.Range; -import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; + +import static de.monticore.lang.monticar.cnntrain.helper.ConfigEntryNameConstants.*; import java.util.*; -public class ReinforcementConfigurationData extends ConfigurationData { - private static final String AST_ENTRY_LEARNING_METHOD = "learning_method"; - private static final String AST_ENTRY_NUM_EPISODES = "num_episodes"; - private static final String AST_ENTRY_DISCOUNT_FACTOR = "discount_factor"; - private static final String AST_ENTRY_NUM_MAX_STEPS = "num_max_steps"; - private static final String AST_ENTRY_TARGET_SCORE = "target_score"; - private static final String AST_ENTRY_TRAINING_INTERVAL = "training_interval"; - private static final String AST_ENTRY_USE_FIX_TARGET_NETWORK = "use_fix_target_network"; - private static final String AST_ENTRY_TARGET_NETWORK_UPDATE_INTERVAL = "target_network_update_interval"; - private static final String AST_ENTRY_SNAPSHOT_INTERVAL = "snapshot_interval"; - private static final String AST_ENTRY_AGENT_NAME = "agent_name"; - private static final String AST_ENTRY_USE_DOUBLE_DQN = "use_double_dqn"; - private static final String AST_ENTRY_LOSS = "loss"; - private static final String AST_ENTRY_RL_ALGORITHM = "rl_algorithm"; - private static final String AST_ENTRY_REPLAY_MEMORY = "replay_memory"; - private static final String AST_ENTRY_STRATEGY = "strategy"; - private static final String AST_ENTRY_ENVIRONMENT = "environment"; - private static final String AST_ENTRY_START_TRAINING_AT = "start_training_at"; - private static final String AST_SOFT_TARGET_UPDATE_RATE = "soft_target_update_rate"; - private static final String AST_EVALUATION_SAMPLES = "evaluation_samples"; - - private static final String ENVIRONMENT_PARAM_REWARD_TOPIC = "reward_topic"; - private static final String ENVIRONMENT_ROS = "ros_interface"; - private static final String ENVIRONMENT_GYM = "gym"; 
- - private static final String STRATEGY_ORNSTEIN_UHLENBECK = "ornstein_uhlenbeck"; - - public ReinforcementConfigurationData(ConfigurationSymbol configuration, String instanceName) { +public class GluonConfigurationData extends ConfigurationData { + public GluonConfigurationData(ConfigurationSymbol configuration, String instanceName) { super(configuration, instanceName); } public Boolean isSupervisedLearning() { - if (configurationContainsKey(AST_ENTRY_LEARNING_METHOD)) { - return retrieveConfigurationEntryValueByKey(AST_ENTRY_LEARNING_METHOD) + if (configurationContainsKey(LEARNING_METHOD)) { + return retrieveConfigurationEntryValueByKey(LEARNING_METHOD) .equals(LearningMethod.SUPERVISED); } return true; } public Boolean isReinforcementLearning() { - return configurationContainsKey(AST_ENTRY_LEARNING_METHOD) - && retrieveConfigurationEntryValueByKey(AST_ENTRY_LEARNING_METHOD).equals(LearningMethod.REINFORCEMENT); + return configurationContainsKey(LEARNING_METHOD) + && retrieveConfigurationEntryValueByKey(LEARNING_METHOD).equals(LearningMethod.REINFORCEMENT); } public Integer getNumEpisodes() { - return !configurationContainsKey(AST_ENTRY_NUM_EPISODES) - ? null : (Integer)retrieveConfigurationEntryValueByKey(AST_ENTRY_NUM_EPISODES); + return !configurationContainsKey(NUM_EPISODES) + ? null : (Integer)retrieveConfigurationEntryValueByKey(NUM_EPISODES); } public Double getDiscountFactor() { - return !configurationContainsKey(AST_ENTRY_DISCOUNT_FACTOR) - ? null : (Double)retrieveConfigurationEntryValueByKey(AST_ENTRY_DISCOUNT_FACTOR); + return !configurationContainsKey(DISCOUNT_FACTOR) + ? null : (Double)retrieveConfigurationEntryValueByKey(DISCOUNT_FACTOR); } public Integer getNumMaxSteps() { - return !configurationContainsKey(AST_ENTRY_NUM_MAX_STEPS) - ? null : (Integer)retrieveConfigurationEntryValueByKey(AST_ENTRY_NUM_MAX_STEPS); + return !configurationContainsKey(NUM_MAX_STEPS) + ? null : (Integer)retrieveConfigurationEntryValueByKey(NUM_MAX_STEPS); } public Double getTargetScore() { - return !configurationContainsKey(AST_ENTRY_TARGET_SCORE) - ? null : (Double)retrieveConfigurationEntryValueByKey(AST_ENTRY_TARGET_SCORE); + return !configurationContainsKey(TARGET_SCORE) + ? null : (Double)retrieveConfigurationEntryValueByKey(TARGET_SCORE); } public Integer getTrainingInterval() { - return !configurationContainsKey(AST_ENTRY_TRAINING_INTERVAL) - ? null : (Integer)retrieveConfigurationEntryValueByKey(AST_ENTRY_TRAINING_INTERVAL); + return !configurationContainsKey(TRAINING_INTERVAL) + ? null : (Integer)retrieveConfigurationEntryValueByKey(TRAINING_INTERVAL); } public Boolean getUseFixTargetNetwork() { - return !configurationContainsKey(AST_ENTRY_USE_FIX_TARGET_NETWORK) - ? null : (Boolean)retrieveConfigurationEntryValueByKey(AST_ENTRY_USE_FIX_TARGET_NETWORK); + return !configurationContainsKey(USE_FIX_TARGET_NETWORK) + ? null : (Boolean)retrieveConfigurationEntryValueByKey(USE_FIX_TARGET_NETWORK); } public Integer getTargetNetworkUpdateInterval() { - return !configurationContainsKey(AST_ENTRY_TARGET_NETWORK_UPDATE_INTERVAL) - ? null : (Integer)retrieveConfigurationEntryValueByKey(AST_ENTRY_TARGET_NETWORK_UPDATE_INTERVAL); + return !configurationContainsKey(TARGET_NETWORK_UPDATE_INTERVAL) + ? null : (Integer)retrieveConfigurationEntryValueByKey(TARGET_NETWORK_UPDATE_INTERVAL); } public Integer getSnapshotInterval() { - return !configurationContainsKey(AST_ENTRY_SNAPSHOT_INTERVAL) - ? 
null : (Integer)retrieveConfigurationEntryValueByKey(AST_ENTRY_SNAPSHOT_INTERVAL); + return !configurationContainsKey(SNAPSHOT_INTERVAL) + ? null : (Integer)retrieveConfigurationEntryValueByKey(SNAPSHOT_INTERVAL); } public String getAgentName() { - return !configurationContainsKey(AST_ENTRY_AGENT_NAME) - ? null : (String)retrieveConfigurationEntryValueByKey(AST_ENTRY_AGENT_NAME); + return !configurationContainsKey(AGENT_NAME) + ? null : (String)retrieveConfigurationEntryValueByKey(AGENT_NAME); } public Boolean getUseDoubleDqn() { - return !configurationContainsKey(AST_ENTRY_USE_DOUBLE_DQN) - ? null : (Boolean)retrieveConfigurationEntryValueByKey(AST_ENTRY_USE_DOUBLE_DQN); + return !configurationContainsKey(USE_DOUBLE_DQN) + ? null : (Boolean)retrieveConfigurationEntryValueByKey(USE_DOUBLE_DQN); } public Double getSoftTargetUpdateRate() { - return !configurationContainsKey(AST_SOFT_TARGET_UPDATE_RATE) - ? null : (Double)retrieveConfigurationEntryValueByKey(AST_SOFT_TARGET_UPDATE_RATE); + return !configurationContainsKey(SOFT_TARGET_UPDATE_RATE) + ? null : (Double)retrieveConfigurationEntryValueByKey(SOFT_TARGET_UPDATE_RATE); } public Integer getStartTrainingAt() { - return !configurationContainsKey(AST_ENTRY_START_TRAINING_AT) - ? null : (Integer)retrieveConfigurationEntryValueByKey(AST_ENTRY_START_TRAINING_AT); + return !configurationContainsKey(START_TRAINING_AT) + ? null : (Integer)retrieveConfigurationEntryValueByKey(START_TRAINING_AT); } public Integer getEvaluationSamples() { - return !configurationContainsKey(AST_EVALUATION_SAMPLES) - ? null : (Integer)retrieveConfigurationEntryValueByKey(AST_EVALUATION_SAMPLES); + return !configurationContainsKey(EVALUATION_SAMPLES) + ? null : (Integer)retrieveConfigurationEntryValueByKey(EVALUATION_SAMPLES); + } + + public Double getPolicyNoise() { + return !configurationContainsKey(POLICY_NOISE) + ? null : (Double) retrieveConfigurationEntryValueByKey(POLICY_NOISE); } + public Double getNoiseClip() { + return !configurationContainsKey(NOISE_CLIP) + ? null : (Double) retrieveConfigurationEntryValueByKey(NOISE_CLIP); + } + public Integer getPolicyDelay() { + return !configurationContainsKey(POLICY_DELAY) + ? null : (Integer) retrieveConfigurationEntryValueByKey(POLICY_DELAY); + } public RLAlgorithm getRlAlgorithm() { if (!isReinforcementLearning()) { return null; } - return !configurationContainsKey(AST_ENTRY_RL_ALGORITHM) - ? RLAlgorithm.DQN : (RLAlgorithm)retrieveConfigurationEntryValueByKey(AST_ENTRY_RL_ALGORITHM); + return !configurationContainsKey(RL_ALGORITHM) + ? 
RLAlgorithm.DQN : (RLAlgorithm)retrieveConfigurationEntryValueByKey(RL_ALGORITHM); } public String getInputNameOfTrainedArchitecture() { if (!this.getConfiguration().getTrainedArchitecture().isPresent()) { throw new IllegalStateException("No trained architecture set"); } - TrainedArchitecture trainedArchitecture = getConfiguration().getTrainedArchitecture().get(); + NNArchitectureSymbol trainedArchitecture = getConfiguration().getTrainedArchitecture().get(); // We allow only one input, the first one is the only input return trainedArchitecture.getInputs().get(0); } @@ -141,7 +128,7 @@ public class ReinforcementConfigurationData extends ConfigurationData { if (!this.getConfiguration().getTrainedArchitecture().isPresent()) { throw new IllegalStateException("No trained architecture set"); } - TrainedArchitecture trainedArchitecture = getConfiguration().getTrainedArchitecture().get(); + NNArchitectureSymbol trainedArchitecture = getConfiguration().getTrainedArchitecture().get(); // We allow only one output, the first one is the only output return trainedArchitecture.getOutputs().get(0); } @@ -151,7 +138,7 @@ public class ReinforcementConfigurationData extends ConfigurationData { return null; } final String inputName = getInputNameOfTrainedArchitecture(); - TrainedArchitecture trainedArchitecture = this.getConfiguration().getTrainedArchitecture().get(); + NNArchitectureSymbol trainedArchitecture = this.getConfiguration().getTrainedArchitecture().get(); return trainedArchitecture.getDimensions().get(inputName); } @@ -160,53 +147,51 @@ public class ReinforcementConfigurationData extends ConfigurationData { return null; } final String outputName = getOutputNameOfTrainedArchitecture(); - TrainedArchitecture trainedArchitecture = this.getConfiguration().getTrainedArchitecture().get(); + NNArchitectureSymbol trainedArchitecture = this.getConfiguration().getTrainedArchitecture().get(); return trainedArchitecture.getDimensions().get(outputName); } public String getLoss() { - return !configurationContainsKey(AST_ENTRY_LOSS) - ? null : retrieveConfigurationEntryValueByKey(AST_ENTRY_LOSS).toString(); + return !configurationContainsKey(LOSS) + ? 
null : retrieveConfigurationEntryValueByKey(LOSS).toString(); } public Map<String, Object> getReplayMemory() { - return getMultiParamEntry(AST_ENTRY_REPLAY_MEMORY, "method"); + return getMultiParamEntry(REPLAY_MEMORY, "method"); } public Map<String, Object> getStrategy() { assert isReinforcementLearning(): "Strategy parameter only for reinforcement learning but called in a " + " non reinforcement learning context"; - Map<String, Object> strategyParams = getMultiParamEntry(AST_ENTRY_STRATEGY, "method"); - if (strategyParams.get("method").equals(STRATEGY_ORNSTEIN_UHLENBECK)) { - assert getConfiguration().getTrainedArchitecture().isPresent(): "Architecture not present," + - " but reinforcement training"; - TrainedArchitecture trainedArchitecture = getConfiguration().getTrainedArchitecture().get(); - final String actionPortName = getOutputNameOfTrainedArchitecture(); - Range actionRange = trainedArchitecture.getRanges().get(actionPortName); - - if (actionRange.isLowerLimitInfinity() && actionRange.isUpperLimitInfinity()) { - strategyParams.put("action_low", null); - strategyParams.put("action_high", null); - } else if(!actionRange.isLowerLimitInfinity() && actionRange.isUpperLimitInfinity()) { - assert actionRange.getLowerLimit().isPresent(); - strategyParams.put("action_low", actionRange.getLowerLimit().get()); - strategyParams.put("action_high", null); - } else if (actionRange.isLowerLimitInfinity() && !actionRange.isUpperLimitInfinity()) { - assert actionRange.getUpperLimit().isPresent(); - strategyParams.put("action_low", null); - strategyParams.put("action_high", actionRange.getUpperLimit().get()); - } else { - assert actionRange.getLowerLimit().isPresent(); - assert actionRange.getUpperLimit().isPresent(); - strategyParams.put("action_low", actionRange.getLowerLimit().get()); - strategyParams.put("action_high", actionRange.getUpperLimit().get()); - } + Map<String, Object> strategyParams = getMultiParamEntry(STRATEGY, "method"); + assert getConfiguration().getTrainedArchitecture().isPresent(): "Architecture not present," + + " but reinforcement training"; + NNArchitectureSymbol trainedArchitecture = getConfiguration().getTrainedArchitecture().get(); + final String actionPortName = getOutputNameOfTrainedArchitecture(); + Range actionRange = trainedArchitecture.getRanges().get(actionPortName); + + if (actionRange.isLowerLimitInfinity() && actionRange.isUpperLimitInfinity()) { + strategyParams.put("action_low", null); + strategyParams.put("action_high", null); + } else if(!actionRange.isLowerLimitInfinity() && actionRange.isUpperLimitInfinity()) { + assert actionRange.getLowerLimit().isPresent(); + strategyParams.put("action_low", actionRange.getLowerLimit().get()); + strategyParams.put("action_high", null); + } else if (actionRange.isLowerLimitInfinity() && !actionRange.isUpperLimitInfinity()) { + assert actionRange.getUpperLimit().isPresent(); + strategyParams.put("action_low", null); + strategyParams.put("action_high", actionRange.getUpperLimit().get()); + } else { + assert actionRange.getLowerLimit().isPresent(); + assert actionRange.getUpperLimit().isPresent(); + strategyParams.put("action_low", actionRange.getLowerLimit().get()); + strategyParams.put("action_high", actionRange.getUpperLimit().get()); } return strategyParams; } public Map<String, Object> getEnvironment() { - return getMultiParamEntry(AST_ENTRY_ENVIRONMENT, "environment"); + return getMultiParamEntry(ENVIRONMENT, "environment"); } public Boolean hasRewardFunction() { @@ -300,12 +285,12 @@ public class 
ReinforcementConfigurationData extends ConfigurationData { } public boolean hasRosRewardTopic() { - Map<String, Object> environmentParameters = getMultiParamEntry(AST_ENTRY_ENVIRONMENT, "environment"); + Map<String, Object> environmentParameters = getMultiParamEntry(ENVIRONMENT, "environment"); if (environmentParameters == null || !environmentParameters.containsKey("environment")) { return false; } - return environmentParameters.containsKey(ENVIRONMENT_PARAM_REWARD_TOPIC); + return environmentParameters.containsKey(ENVIRONMENT_REWARD_TOPIC); } private Map<String, Object> getMultiParamEntry(final String key, final String valueName) { diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/annotations/ArchitectureAdapter.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/annotations/ArchitectureAdapter.java index 8f2d08870ab70520018cecec947958be048ef6e1..e54b621927d0d86440970cb71a5c041d3b8a445b 100644 --- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/annotations/ArchitectureAdapter.java +++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/annotations/ArchitectureAdapter.java @@ -2,8 +2,8 @@ package de.monticore.lang.monticar.cnnarch.gluongenerator.annotations; import de.monticore.lang.monticar.cnnarch._symboltable.ArchitectureSymbol; import de.monticore.lang.monticar.cnnarch._symboltable.IOSymbol; +import de.monticore.lang.monticar.cnntrain._symboltable.NNArchitectureSymbol; import de.monticore.lang.monticar.cnntrain.annotations.Range; -import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; import de.monticore.lang.monticar.ranges._ast.ASTRange; import de.monticore.symboltable.CommonSymbol; @@ -14,11 +14,13 @@ import java.util.stream.Collectors; import static com.google.common.base.Preconditions.checkNotNull; -public class ArchitectureAdapter implements TrainedArchitecture { +public class ArchitectureAdapter extends NNArchitectureSymbol { private ArchitectureSymbol architectureSymbol; - public ArchitectureAdapter(final ArchitectureSymbol architectureSymbol) { + public ArchitectureAdapter(final String name, + final ArchitectureSymbol architectureSymbol) { + super(name); checkNotNull(architectureSymbol); this.architectureSymbol = architectureSymbol; } @@ -55,6 +57,10 @@ public class ArchitectureAdapter implements TrainedArchitecture { s -> s.getDefinition().getType().getDomain().getName())); } + public ArchitectureSymbol getArchitectureSymbol() { + return this.architectureSymbol; + } + private Range astRangeToTrainRange(final ASTRange range) { if (range == null || (range.hasNoLowerLimit() && range.hasNoUpperLimit())) { return Range.withInfinityLimits(); diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/RewardFunctionSourceGenerator.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/RewardFunctionSourceGenerator.java index b94e58d55e1b0734ee4803871d7cfb9d828ef3d9..5758ccff2e189604b7f2a702bddc4155de5913a8 100644 --- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/RewardFunctionSourceGenerator.java +++ b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/RewardFunctionSourceGenerator.java @@ -1,8 +1,14 @@ package de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement; +import de.monticore.lang.embeddedmontiarc.embeddedmontiarc._symboltable.instanceStructure.EMAComponentInstanceSymbol; +import de.monticore.lang.tagging._symboltable.TaggingResolver; + /** * */ 
public interface RewardFunctionSourceGenerator { - void generate(String modelPath, String qualifiedName, String targetPath); + TaggingResolver createTaggingResolver(String modelPath); + EMAComponentInstanceSymbol resolveSymbol(TaggingResolver taggingResolver, String rootModel); + void generate(String modelPath, String rootModel, String targetPath); + void generate(EMAComponentInstanceSymbol componentInstanceSymbol, TaggingResolver taggingResolver, String targetPath); } \ No newline at end of file diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationException.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationException.java deleted file mode 100644 index 70389b6fe81945552c177fa614315446a870a438..0000000000000000000000000000000000000000 --- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationException.java +++ /dev/null @@ -1,7 +0,0 @@ -package de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.critic; - -public class CriticNetworkGenerationException extends RuntimeException { - public CriticNetworkGenerationException(String s) { - super("Generation of critic network failed: " + s); - } -} diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationPair.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationPair.java deleted file mode 100644 index 8ea2d62fb68d1f356e7b357d9ed71e1f2fed9ead..0000000000000000000000000000000000000000 --- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerationPair.java +++ /dev/null @@ -1,21 +0,0 @@ -package de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.critic; - -import java.util.Map; - -public class CriticNetworkGenerationPair { - private String criticNetworkName; - private Map<String, String> fileContent; - - public CriticNetworkGenerationPair(String criticNetworkName, Map<String, String> fileContent) { - this.criticNetworkName = criticNetworkName; - this.fileContent = fileContent; - } - - public String getCriticNetworkName() { - return criticNetworkName; - } - - public Map<String, String> getFileContent() { - return fileContent; - } -} diff --git a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerator.java b/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerator.java deleted file mode 100644 index 5040512ea37912f5e70fea05d83a2f4ce85f1b08..0000000000000000000000000000000000000000 --- a/src/main/java/de/monticore/lang/monticar/cnnarch/gluongenerator/reinforcement/critic/CriticNetworkGenerator.java +++ /dev/null @@ -1,213 +0,0 @@ -package de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.critic; - -import com.google.common.collect.Lists; -import de.monticore.lang.monticar.cnnarch._symboltable.*; -import de.monticore.lang.monticar.cnnarch.gluongenerator.CNNArch2Gluon; -import de.monticore.lang.monticar.cnnarch.gluongenerator.CNNArch2GluonArchitectureSupportChecker; -import de.monticore.lang.monticar.cnnarch.gluongenerator.CNNArch2GluonLayerSupportChecker; -import de.monticore.lang.monticar.cnnarch.generator.CNNArchSymbolCompiler; -import de.monticore.lang.monticar.cnnarch.generator.TemplateConfiguration; -import 
de.monticore.lang.monticar.cnntrain._symboltable.ConfigurationSymbol; -import de.monticore.lang.monticar.cnntrain.annotations.Range; -import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; -import de.se_rwth.commons.logging.Log; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; - -import java.io.IOException; -import java.nio.charset.Charset; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.*; - -import static com.google.common.base.Preconditions.checkState; - -public class CriticNetworkGenerator { - private static final String START_SEQUENCE = "implementationCritic(state,action)"; - - private String generationTargetPath; - private String rootModelsDir; - - protected String getGenerationTargetPath() { - return generationTargetPath; - } - - public void setGenerationTargetPath(String getGenerationTargetPath) { - this.generationTargetPath = getGenerationTargetPath; - } - - protected String getRootModelsDir() { - return rootModelsDir; - } - - public void setRootModelsDir(String rootModelsDir) { - this.rootModelsDir = rootModelsDir; - } - - public CriticNetworkGenerationPair generateCriticNetworkContent(TemplateConfiguration templateConfiguration, - ConfigurationSymbol configurationSymbol) { - checkState(getRootModelsDir() != null, "Root project directory is not set"); - checkState(getGenerationTargetPath() != null, "Target path is not set"); - - failIfArchitectureNotAvailable(configurationSymbol); - assert configurationSymbol.getTrainedArchitecture().isPresent(); - TrainedArchitecture trainedArchitecture = configurationSymbol.getTrainedArchitecture().get(); - failIfActorHasMultipleIO(trainedArchitecture); - - List<String> criticNetwork = retrieveFullNameOfCriticsNetworkFromConfiguration(configurationSymbol); - final String criticNetworkName = criticNetwork.get(criticNetwork.size()-1); - final Path pathTocriticNetworkFile = retrievePathToCriticNetworkFileFromFullName(criticNetwork, criticNetworkName); - final String networkImplementation = parseNetworkImplementationFromFile(pathTocriticNetworkFile); - - Map<String, Object> context = makeTemplateContextMap(trainedArchitecture, criticNetworkName, networkImplementation); - Path directoryOfCnnArchFile = makeCnnArchFileFromContext(templateConfiguration, context); - - Map<String, String> fileContentMap = generatePythonNetworkFiles(criticNetworkName, directoryOfCnnArchFile); - deleteOutputDirectory(directoryOfCnnArchFile); - - return new CriticNetworkGenerationPair(criticNetworkName, fileContentMap); - } - - private void deleteOutputDirectory(Path directoryOfCnnArchFile) { - try { - FileUtils.deleteDirectory(directoryOfCnnArchFile.toFile()); - } catch (IOException e) { - Log.warn("Cannot delete temporary CNN arch directory: " + directoryOfCnnArchFile.toString()); - } - } - - private Map<String, String> generatePythonNetworkFiles(String criticNetworkName, Path directoryOfCnnArchFile) { - CNNArch2Gluon gluonGenerator = new CNNArch2Gluon(); - gluonGenerator.setGenerationTargetPath(this.getGenerationTargetPath()); - - Map<String, String> fileContentMap = new HashMap<>(); - CNNArchSymbolCompiler symbolCompiler = new CNNArchSymbolCompiler(new CNNArch2GluonArchitectureSupportChecker(), - new CNNArch2GluonLayerSupportChecker()); - ArchitectureSymbol architectureSymbol = symbolCompiler.compileArchitectureSymbolFromModelsDir(directoryOfCnnArchFile, criticNetworkName); - architectureSymbol.setComponentName(criticNetworkName); - 
fileContentMap.putAll(gluonGenerator.generateStringsAllowMultipleIO(architectureSymbol, true)); - return fileContentMap; - } - - private Path makeCnnArchFileFromContext(TemplateConfiguration templateConfiguration, Map<String, Object> context) { - final String architectureContent = templateConfiguration.processTemplate( - context, "reinforcement/architecture/CriticArchitecture.ftl"); - - Path tmpDirectoryToArchFile = Paths.get(this.getGenerationTargetPath(), "tmp"); - try { - if (!tmpDirectoryToArchFile.toFile().exists()) { - Files.createDirectories(tmpDirectoryToArchFile); - } - Files.write( - Paths.get(tmpDirectoryToArchFile.toString(), context.get("architectureName") + ".cnna"), architectureContent.getBytes()); - } catch (IOException e) { - failWithMessage(e.getMessage()); - } - return tmpDirectoryToArchFile; - } - - private Map<String, Object> makeTemplateContextMap(TrainedArchitecture trainedArchitecture, String criticNetworkName, String networkImplementation) { - final String stateName = trainedArchitecture.getInputs().get(0); - final String actionName = trainedArchitecture.getOutputs().get(0); - - Map<String, List<Integer>> dimensions = trainedArchitecture.getDimensions(); - List<Integer> stateDimensions = dimensions.get(stateName); - List<Integer> actionDimensions = dimensions.get(actionName); - - Map<String, Range> ranges = trainedArchitecture.getRanges(); - Range stateRange = ranges.get(stateName); - Range actionRange = ranges.get(actionName); - - Map<String, String> types = trainedArchitecture.getTypes(); - final String stateType = types.get(stateName); - final String actionType = types.get(actionName); - - Map<String, Object> context = new HashMap<>(); - context.put("stateDimension", stateDimensions); - context.put("actionDimension", actionDimensions); - context.put("stateRange", stateRange); - context.put("actionRange", actionRange); - context.put("stateType", stateType); - context.put("actionType", actionType); - context.put("implementation", networkImplementation); - context.put("architectureName", criticNetworkName); - return context; - } - - private String parseNetworkImplementationFromFile(Path criticNetworkFile) { - String criticNetworkFileContent = null; - try { - criticNetworkFileContent = new String(Files.readAllBytes(criticNetworkFile), Charset.forName("UTF-8")); - } catch (IOException e) { - failWithMessage("Cannot create critic network file:" + e.getMessage()); - } - - String contentWhiteSpaceRemoved = criticNetworkFileContent.replaceAll("\\s+",""); - - if (!contentWhiteSpaceRemoved.contains(START_SEQUENCE) - || StringUtils.countMatches(contentWhiteSpaceRemoved, "{") != 1 - || StringUtils.countMatches(contentWhiteSpaceRemoved, "}") != 1 - || contentWhiteSpaceRemoved.charAt(contentWhiteSpaceRemoved.length() - 1) != '}') { - failWithMessage("Cannot parse critic file"); - } - - final int startOfNNImplementation = contentWhiteSpaceRemoved.indexOf("{") + 1; - final int endOfNNImplementation = contentWhiteSpaceRemoved.indexOf("}"); - return contentWhiteSpaceRemoved.substring(startOfNNImplementation, endOfNNImplementation); - } - - private Path retrievePathToCriticNetworkFileFromFullName(List<String> criticNetwork, String criticNetworkName) { - // Add file ending cnna to file name - criticNetwork.set(criticNetwork.size()-1, criticNetworkName + ".cnna"); - - Path root = Paths.get(this.getRootModelsDir()); - Path criticNetworkFile = criticNetwork.stream().map(Paths::get).reduce(root, Path::resolve); - - if (!criticNetworkFile.toFile().exists()) { - failWithMessage("Critic 
network file does not exist in " + criticNetworkFile.toString()); - } - return criticNetworkFile; - } - - private List<String> retrieveFullNameOfCriticsNetworkFromConfiguration(ConfigurationSymbol configurationSymbol) { - // Load critics network - failIfConfigurationHasNoCritic(configurationSymbol); - - assert configurationSymbol.getEntry("critic").getValue().getValue() instanceof String; - List<String> criticNetwork = Lists.newArrayList(( - (String)configurationSymbol.getEntry("critic").getValue().getValue()).split("\\.")); - - // Check if file name is upper case otherwise make it upper case - int size = criticNetwork.size(); - if (Character.isLowerCase(criticNetwork.get(size-1).charAt(0))) { - String lowerCaseFileName = criticNetwork.get(size-1); - String upperCaseFileName = lowerCaseFileName.substring(0,1).toUpperCase() + lowerCaseFileName.substring(1); - criticNetwork.set(size-1, upperCaseFileName); - } - return criticNetwork; - } - - private void failIfConfigurationHasNoCritic(ConfigurationSymbol configurationSymbol) { - if (!configurationSymbol.getEntryMap().containsKey("critic")) { - failWithMessage("No critic network file given, but is required for selected algorithm"); - } - } - - private void failIfActorHasMultipleIO(TrainedArchitecture trainedArchitecture) { - if (trainedArchitecture.getInputs().size() > 1 || trainedArchitecture.getOutputs().size() > 1) { - failWithMessage("Actor component with multiple inputs or outputs is not supported by this generator"); - } - } - - private void failIfArchitectureNotAvailable(ConfigurationSymbol configurationSymbol) { - if (!configurationSymbol.getTrainedArchitecture().isPresent()) { - failWithMessage("No architecture symbol found but is required for selected algorithm"); - } - } - - private void failWithMessage(final String message) { - Log.error("Critic network generation failed: " + message); - throw new CriticNetworkGenerationException(message); - } -} \ No newline at end of file diff --git a/src/main/resources/templates/gluon/reinforcement/Trainer.ftl b/src/main/resources/templates/gluon/reinforcement/Trainer.ftl index c2e707afde10987362d29dc959c1474b0291d373..99cca6efc44d4f2e23f207126dd54d1c6476a214 100644 --- a/src/main/resources/templates/gluon/reinforcement/Trainer.ftl +++ b/src/main/resources/templates/gluon/reinforcement/Trainer.ftl @@ -1,10 +1,10 @@ <#setting number_format="computer"> <#assign config = configurations[0]> -<#assign rlAgentType=config.rlAlgorithm?switch("dqn", "DqnAgent", "ddpg", "DdpgAgent")> +<#assign rlAgentType=config.rlAlgorithm?switch("dqn", "DqnAgent", "ddpg", "DdpgAgent", "td3", "TwinDelayedDdpgAgent")> from ${rlFrameworkModule}.agent import ${rlAgentType} from ${rlFrameworkModule}.util import AgentSignalHandler from ${rlFrameworkModule}.cnnarch_logger import ArchLogger -<#if config.rlAlgorithm=="ddpg"> +<#if config.rlAlgorithm=="ddpg" || config.rlAlgorithm=="td3"> from ${rlFrameworkModule}.CNNCreator_${criticInstanceName} import CNNCreator_${criticInstanceName} </#if> import ${rlFrameworkModule}.environment @@ -137,8 +137,10 @@ if __name__ == "__main__": </#if> <#if (config.rlAlgorithm == "dqn")> <#include "params/DqnAgentParams.ftl"> -<#else> +<#elseif config.rlAlgorithm == "ddpg"> <#include "params/DdpgAgentParams.ftl"> +<#else> +<#include "params/Td3AgentParams.ftl"> </#if> } @@ -168,7 +170,7 @@ if __name__ == "__main__": if train_successful: <#if (config.rlAlgorithm == "dqn")> - agent.save_best_network(qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_0_newest', epoch=0) + 
agent.export_best_network(path=qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_0_newest', epoch=0) <#else> - agent.save_best_network(actor_creator._model_dir_ + actor_creator._model_prefix_ + '_0_newest', epoch=0) + agent.export_best_network(path=actor_creator._model_dir_ + actor_creator._model_prefix_ + '_0_newest', epoch=0) </#if> \ No newline at end of file diff --git a/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl b/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl index 6dbf9ef17065c25832844649b1c3bb16009f4e6f..148c89848b8d990b5f635c3976fd15e71a608893 100644 --- a/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl +++ b/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl @@ -124,10 +124,10 @@ class Agent(object): def _make_pickle_ready(self, session_dir): del self._training_stats.logger - self._logger = None self._environment.close() self._environment = None - self._save_net(self._best_net, 'best_net', session_dir) + self._export_net(self._best_net, 'best_net', filedir=session_dir) + self._logger = None self._best_net = None def _make_config_dict(self): @@ -258,26 +258,22 @@ class Agent(object): return self._target_score is not None\ and avg_reward > self._target_score - def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + def _export_net(self, net, filename, filedir=None, episode=None): assert self._output_directory assert isinstance(net, gluon.HybridBlock) make_directory_if_not_exist(self._output_directory) + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename) - if(episode is not None): - self._logger.info( - 'Saving model parameters after episode %d' % episode) + if episode is not None: filename = filename + '-ep{}'.format(episode) - else: - self._logger.info('Saving model parameters') - self._save_net(net, filename) - def _save_net(self, net, filename, filedir=None): - filedir = self._output_directory if filedir is None else filedir - filename = os.path.join(filedir, filename) - net.save_parameters(filename + '.params') net.export(filename, epoch=0) + net.save_parameters(filename + '.params') - def save_best_network(self, path, epoch=0): + def export_best_network(self, path=None, epoch=0): + path = os.path.join(self._output_directory, 'best_network')\ + if path is None else path self._logger.info( 'Saving best network with average reward of {}'.format( self._best_avg_score)) @@ -373,15 +369,17 @@ class DdpgAgent(Agent): def _make_pickle_ready(self, session_dir): super(DdpgAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._actor, 'current_actor') + self._export_net(self._actor, 'current_actor') - self._save_net(self._actor, 'actor', session_dir) + self._export_net(self._actor, 'actor', filedir=session_dir) self._actor = None - self._save_net(self._critic, 'critic', session_dir) + self._export_net(self._critic, 'critic', filedir=session_dir) self._critic = None - self._save_net(self._actor_target, 'actor_target', session_dir) + self._export_net( + self._actor_target, 'actor_target', filedir=session_dir) self._actor_target = None - self._save_net(self._critic_target, 'critic_target', session_dir) + self._export_net( + self._critic_target, 'critic_target', filedir=session_dir) self._critic_target = None @classmethod @@ -449,7 +447,8 @@ class DdpgAgent(Agent): return action[0].asnumpy() def save_parameters(self, episode): - self._save_parameters(self._actor, episode=episode) + self._export_net( + self._actor, 
self._agent_name + '_actor', episode=episode) def train(self, episodes=None): self.save_config_file() @@ -504,6 +503,7 @@ class DdpgAgent(Agent): # actor and exploration noise N according to strategy action = self._strategy.select_action( self.get_next_action(state)) + self._strategy.decay(self._current_episode) # Execute action a and observe reward r and next state ns next_state, reward, terminal, _ = \ @@ -545,10 +545,10 @@ class DdpgAgent(Agent): # with critic parameters tmp_critic = self._copy_critic() with autograd.record(): - actor_qvalues = tmp_critic(states, self._actor(states)) # For maximizing qvalues we have to multiply with -1 # as we use a minimizer - actor_loss = -1 * actor_qvalues.mean() + actor_loss = -tmp_critic( + states, self._actor(states)).mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -566,7 +566,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(actor_qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues.asnumpy()) / self._minibatch_size training_steps += 1 @@ -594,7 +594,6 @@ class DdpgAgent(Agent): self._strategy.cur_eps, episode_reward) self._do_snapshot_if_in_interval(self._current_episode) - self._strategy.decay(self._current_episode) if self._is_target_reached(avg_reward): self._logger.info( @@ -605,7 +604,7 @@ class DdpgAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -641,6 +640,359 @@ class DdpgAgent(Agent): self._critic, self._state_dim, self._action_dim, ctx=self._ctx) +class TwinDelayedDdpgAgent(DdpgAgent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None, + policy_noise=0.2, + noise_clip=0.5, + policy_delay=2 + ): + super(TwinDelayedDdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples, + critic=critic, soft_target_update_rate=soft_target_update_rate, + actor=actor, actor_optimizer=actor_optimizer, + actor_optimizer_params=actor_optimizer_params, + critic_optimizer=critic_optimizer, + critic_optimizer_params=critic_optimizer_params) + + self._policy_noise = policy_noise + self._noise_clip = noise_clip + self._policy_delay = policy_delay + + self._critic2 = self._critic.__class__() + self._critic2.collect_params().initialize( + mx.init.Normal(), ctx=self._ctx) + 
self._critic2.hybridize() + self._critic2(nd.ones((1,) + state_dim, ctx=self._ctx), + nd.ones((1,) + action_dim, ctx=self._ctx)) + + self._critic2_target = self._copy_critic2() + + self._critic2_optimizer = critic_optimizer + self._critic2_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + def _make_pickle_ready(self, session_dir): + super(TwinDelayedDdpgAgent, self)._make_pickle_ready(session_dir) + self._export_net(self._critic2, 'critic2', filedir=session_dir) + self._critic2 = None + self._export_net( + self._critic2_target, 'critic2_target', filedir=session_dir) + self._critic2_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + files['critic2_net_params'] = os.path.join( + session_dir, 'critic2.params') + files['critic2_target_net_params'] = os.path.join( + session_dir, 'critic2_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._critic2 = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2.load_parameters(files['critic2_net_params'], agent._ctx) + agent._critic2.hybridize() + agent._critic2( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic2_target = copy_net_with_two_inputs( + agent._critic2, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2_target.load_parameters( + files['critic2_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _copy_critic2(self): + assert self._critic2 is not None + 
assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic2, self._state_dim, self._action_dim, ctx=self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start TwinDelayedDDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q1' and Q2' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + self._critic2_target = self._copy_critic2() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + trainer_critic2 = gluon.Trainer( + self._critic2.collect_params(), self._critic2_optimizer, + self._critic2_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + actor_updates = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + self._strategy.decay(self._current_episode) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + clipped_noise = nd.array( + np.clip( + np.random.normal( + loc=0, scale=self._policy_noise, + size=self._minibatch_size + ).reshape(self._minibatch_size, 1), + -self._noise_clip, + self._noise_clip + ), + ctx=self._ctx + ) + target_action = np.clip( + self._actor_target(next_states) + clipped_noise, + self._strategy._action_low, + self._strategy._action_high) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + target_qvalues1 = self._critic_target(next_states, + target_action) + target_qvalues2 = self._critic2_target(next_states, + target_action) + target_qvalues = nd.minimum(target_qvalues1, + target_qvalues2) + y = rewards + (1 - terminals) * self._discount_factor\ + * target_qvalues + + with autograd.record(): + qvalues1 = self._critic(states, actions) + critic1_loss = l2_loss(qvalues1, y) + critic1_loss.backward() + trainer_critic.step(self._minibatch_size) + + with autograd.record(): 
+ qvalues2 = self._critic2(states, actions) + critic2_loss = l2_loss(qvalues2, y) + critic2_loss.backward() + trainer_critic2.step(self._minibatch_size) + + critic_loss = (critic1_loss.mean() + critic2_loss.mean())/2 + + if self._total_steps % self._policy_delay == 0: + tmp_critic = self._copy_critic() + with autograd.record(): + actor_loss = -tmp_critic( + states, self._actor(states)).mean() + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + self._critic2_target = self._soft_update( + self._critic2, self._critic2_target, + self._soft_target_update_rate) + + actor_updates = actor_updates + 1 + else: + actor_loss = nd.array([0], ctx=self._ctx) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss += 0 if actor_updates == 0 else\ + np.sum(actor_loss.asnumpy()[0]) + episode_avg_q_value +=\ + np.sum(target_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if actor_updates == 0\ + else (episode_actor_loss / actor_updates) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if actor_updates == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.export_best_network() + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(TwinDelayedDdpgAgent, self)._make_config_dict() + config['policy_noise'] = self._policy_noise + config['noise_clip'] = self._noise_clip + config['policy_delay'] = self._policy_delay + return config + + class DqnAgent(Agent): def __init__( self, @@ -652,7 +1004,7 @@ class DqnAgent(Agent): action_dim, ctx=None, discount_factor=.9, - loss_function='euclidean', + loss_function='l2', optimizer='rmsprop', optimizer_params={'learning_rate': 0.09}, training_episodes=50, @@ -691,6 +1043,14 @@ class DqnAgent(Agent): self._double_dqn = double_dqn self._use_fix_target = use_fix_target + # Build memory buffer for discrete actions + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = (1,) + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + # Initialize best network self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) self._best_avg_score = -np.infty @@ -740,10 +1100,10 @@ class DqnAgent(Agent): def _make_pickle_ready(self, session_dir): 
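The TwinDelayedDdpgAgent training loop added above implements the three TD3 ingredients: clipped Gaussian noise is added to the target policy's action, the TD target uses the element-wise minimum of the two target critics, and the actor plus all target networks are only updated every `policy_delay` steps. The following is a minimal, self-contained sketch of just that target computation; `actor_target`, `critic1_target`, `critic2_target` and the batch tensors are placeholder names for illustration, not identifiers from the generated code, and a 1-D action space is assumed.

import mxnet as mx
from mxnet import nd


def td3_target(actor_target, critic1_target, critic2_target,
               rewards, next_states, terminals,
               discount=0.99, policy_noise=0.2, noise_clip=0.5,
               action_low=-1.0, action_high=1.0, ctx=None):
    # y = r + gamma * (1 - done) * min(Q1'(s', a~), Q2'(s', a~)),
    # with a~ = clip(mu'(s') + clip(eps, -c, c), a_low, a_high)
    ctx = ctx if ctx is not None else mx.cpu()
    batch_size = next_states.shape[0]
    noise = nd.clip(
        nd.random.normal(loc=0, scale=policy_noise,
                         shape=(batch_size, 1), ctx=ctx),
        -noise_clip, noise_clip)
    target_action = nd.clip(actor_target(next_states) + noise,
                            action_low, action_high)
    target_q = nd.minimum(critic1_target(next_states, target_action),
                          critic2_target(next_states, target_action))
    rewards = rewards.reshape((batch_size, 1))
    terminals = terminals.reshape((batch_size, 1))
    return rewards + (1 - terminals) * discount * target_q

Taking the smaller of the two critic estimates counteracts the Q-value overestimation that plain DDPG is prone to, which is the core motivation behind TD3.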
super(DqnAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._qnet, 'current_qnet') - self._save_net(self._qnet, 'qnet', session_dir) + self._export_net(self._qnet, 'current_qnet') + self._export_net(self._qnet, 'qnet', filedir=session_dir) self._qnet = None - self._save_net(self._target_qnet, 'target_net', session_dir) + self._export_net(self._target_qnet, 'target_net', filedir=session_dir) self._target_qnet = None def get_q_values(self, state, with_best=False): @@ -839,6 +1199,7 @@ class DqnAgent(Agent): # 1. Choose an action based on current game state and policy q_values = self._qnet(nd.array([state], ctx=self._ctx)) action = self._strategy.select_action(q_values[0]) + self._strategy.decay(self._current_episode) # 2. Play the game for a single step next_state, reward, terminal, _ =\ @@ -846,7 +1207,7 @@ class DqnAgent(Agent): # 3. Store transition in replay memory self._memory.append( - state, action, reward, next_state, terminal) + state, [action], reward, next_state, terminal) # 4. Train the network if in interval if self._do_training(): @@ -875,8 +1236,6 @@ class DqnAgent(Agent): self._current_episode, start, training_steps, episode_loss, self._strategy.cur_eps, episode_reward) - self._strategy.decay(self._current_episode) - if self._is_target_reached(avg_reward): self._logger.info( 'Target score is reached in average; Training is stopped') @@ -886,7 +1245,7 @@ class DqnAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -902,7 +1261,8 @@ class DqnAgent(Agent): return config def save_parameters(self, episode): - self._save_parameters(self._qnet, episode=episode) + self._export_net( + self._qnet, self._agent_name + '_qnet', episode=episode) def _save_current_as_best_net(self): self._best_net = copy_net( diff --git a/src/main/resources/templates/gluon/reinforcement/agent/Strategy.ftl b/src/main/resources/templates/gluon/reinforcement/agent/Strategy.ftl index 8e4c279e5569d5b1505ee4fb9f6e9136a40c30bb..d2d2386658c56c5ebb6d654468669a2664622305 100644 --- a/src/main/resources/templates/gluon/reinforcement/agent/Strategy.ftl +++ b/src/main/resources/templates/gluon/reinforcement/agent/Strategy.ftl @@ -13,18 +13,21 @@ class StrategyBuilder(object): epsilon_decay_method='no', epsilon_decay=0.0, epsilon_decay_start=0, + epsilon_decay_per_step=False, action_dim=None, action_low=None, action_high=None, mu=0.0, theta=0.5, - sigma=0.3 + sigma=0.3, + noise_variance=0.1 ): if epsilon_decay_method == 'linear': decay = LinearDecay( eps_decay=epsilon_decay, min_eps=min_epsilon, - decay_start=epsilon_decay_start) + decay_start=epsilon_decay_start, + decay_per_step=epsilon_decay_per_step) else: decay = NoDecay() @@ -44,6 +47,13 @@ class StrategyBuilder(object): return OrnsteinUhlenbeckStrategy( action_dim, action_low, action_high, epsilon, mu, theta, sigma, decay) + elif method == 'gaussian': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert noise_variance is not None + return GaussianNoiseStrategy(action_dim, action_low, action_high, + epsilon, noise_variance, decay) else: assert action_dim is not None assert len(action_dim) == 1 @@ -70,17 +80,27 @@ class NoDecay(BaseDecay): class LinearDecay(BaseDecay): - def __init__(self, eps_decay, min_eps=0, decay_start=0): + def 
__init__(self, eps_decay, min_eps=0, decay_start=0, decay_per_step=False): super(LinearDecay, self).__init__() self.eps_decay = eps_decay self.min_eps = min_eps self.decay_start = decay_start + self.decay_per_step = decay_per_step + self.last_episode = -1 - def decay(self, cur_eps, episode): - if episode < self.decay_start: - return cur_eps + def do_decay(self, episode): + if self.decay_per_step: + do = (episode >= self.decay_start) else: + do = ((self.last_episode != episode) and (episode >= self.decay_start)) + self.last_episode = episode + return do + + def decay(self, cur_eps, episode): + if self.do_decay(episode): return max(cur_eps - self.eps_decay, self.min_eps) + else: + return cur_eps class BaseStrategy(object): @@ -170,3 +190,29 @@ class OrnsteinUhlenbeckStrategy(BaseStrategy): noise = self._evolve_state() action = (1.0 - self.cur_eps) * values + (self.cur_eps * noise) return np.clip(action, self._action_low, self._action_high) + + +class GaussianNoiseStrategy(BaseStrategy): + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + noise_variance, + decay=NoDecay() + ): + super(GaussianNoiseStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._noise_variance = noise_variance + + def select_action(self, values): + noise = np.random.normal(loc=0.0, scale=self._noise_variance, size=self._action_dim) + action = values + self.cur_eps * noise + return np.clip(action, self._action_low, self._action_high) diff --git a/src/main/resources/templates/gluon/reinforcement/architecture/CriticArchitecture.ftl b/src/main/resources/templates/gluon/reinforcement/architecture/CriticArchitecture.ftl deleted file mode 100644 index c923c08b41ca0b4b90c25ba58712d649eb0819f9..0000000000000000000000000000000000000000 --- a/src/main/resources/templates/gluon/reinforcement/architecture/CriticArchitecture.ftl +++ /dev/null @@ -1,7 +0,0 @@ -architecture ${architectureName}() { - def input ${stateType}<#if stateRange??>(<#if stateRange.isLowerLimitInfinity()>-oo<#else>${stateRange.lowerLimit.get()}</#if>:<#if stateRange.isUpperLimitInfinity()>oo<#else>${stateRange.upperLimit.get()}</#if>)</#if>^{<#list stateDimension as d>${d}<#if d?has_next>,</#if></#list>} state - def input ${actionType}<#if actionRange??>(<#if actionRange.isLowerLimitInfinity()>-oo<#else>${actionRange.lowerLimit.get()}</#if>:<#if actionRange.isUpperLimitInfinity()>oo<#else>${actionRange.upperLimit.get()}</#if>)</#if>^{<#list actionDimension as d>${d}<#if d?has_next>,</#if></#list>} action - def output Q(-oo:oo)^{1} qvalue - - ${implementation}->FullyConnected(units=1)->qvalue; -} \ No newline at end of file diff --git a/src/main/resources/templates/gluon/reinforcement/environment/Environment.ftl b/src/main/resources/templates/gluon/reinforcement/environment/Environment.ftl index cc02e137810327dad6d8d4dcadc88a5bc169c639..04e6cd3e3592781d662df536ff55efa67824ef41 100644 --- a/src/main/resources/templates/gluon/reinforcement/environment/Environment.ftl +++ b/src/main/resources/templates/gluon/reinforcement/environment/Environment.ftl @@ -151,7 +151,6 @@ class RosEnvironment(Environment): def reset(self): self.__in_reset = True - time.sleep(0.5) reset_message = Bool() reset_message.data = True self.__waiting_for_state_update = True @@ -187,7 +186,8 @@ class RosEnvironment(Environment): next_state = self.__last_received_state terminal = self.__last_received_terminal reward = <#if 
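Strategy.ftl above now also emits a GaussianNoiseStrategy (i.i.d. Gaussian noise scaled by the current epsilon, clipped to the action bounds) and extends LinearDecay with an optional per-step decay mode. A short sketch of how the two classes fit together, assuming the generated reinforcement_learning package is importable; the numeric values and the 2-D action space are placeholders.

import numpy as np
from reinforcement_learning.strategy import LinearDecay, GaussianNoiseStrategy

# decay epsilon linearly on every decay() call once step 100 is reached
decay = LinearDecay(eps_decay=1e-4, min_eps=0.05,
                    decay_start=100, decay_per_step=True)

strategy = GaussianNoiseStrategy(action_dim=(2,),
                                 action_low=-1.0, action_high=1.0,
                                 eps=1.0, noise_variance=0.1,
                                 decay=decay)

action = strategy.select_action(np.zeros(2))  # actor output plus eps-scaled noise, clipped
strategy.decay(101)  # shrinks strategy.cur_eps via the decay hook the agents call each step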
config.hasRosRewardTopic()>self.__last_received_reward<#else>self.__calc_reward(next_state, terminal)</#if> - rospy.logdebug('Calculated reward: {}'.format(reward)) + + logger.debug('Transition: ({}, {}, {}, {})'.format(action, reward, next_state, terminal)) return next_state, reward, terminal, 0 @@ -206,25 +206,24 @@ class RosEnvironment(Environment): else: rospy.logerr("Timeout 3 times in a row: Terminate application") exit() - time.sleep(100/1000) + time.sleep(1/500) def close(self): rospy.signal_shutdown('Program ended!') def __state_callback(self, data): self.__last_received_state = np.array(data.data, dtype='float32').reshape((<#list config.stateDim as d>${d},</#list>)) - rospy.logdebug('Received state: {}'.format(self.__last_received_state)) + logger.debug('Received state: {}'.format(self.__last_received_state)) self.__waiting_for_state_update = False def __terminal_state_callback(self, data): - self.__last_received_terminal = data.data - rospy.logdebug('Received terminal flag: {}'.format(self.__last_received_terminal)) - logger.debug('Received terminal: {}'.format(self.__last_received_terminal)) + self.__last_received_terminal = np.bool(data.data) + logger.debug('Received terminal flag: {}'.format(self.__last_received_terminal)) self.__waiting_for_terminal_update = False <#if config.hasRosRewardTopic()> def __reward_callback(self, data): - self.__last_received_reward = float(data.data) + self.__last_received_reward = np.float32(data.data) logger.debug('Received reward: {}'.format(self.__last_received_reward)) self.__waiting_for_reward_update = False <#else> diff --git a/src/main/resources/templates/gluon/reinforcement/params/DdpgAgentParams.ftl b/src/main/resources/templates/gluon/reinforcement/params/DdpgAgentParams.ftl index ec22fc2f6481047ce62c245780335b4576975d31..3894014b9e73ad97600ce67b61481a9454e369c9 100644 --- a/src/main/resources/templates/gluon/reinforcement/params/DdpgAgentParams.ftl +++ b/src/main/resources/templates/gluon/reinforcement/params/DdpgAgentParams.ftl @@ -1,5 +1,5 @@ - 'actor': actor_creator.net, - 'critic': critic_creator.net, + 'actor': actor_creator.networks[0], + 'critic': critic_creator.networks[0], <#if (config.softTargetUpdateRate)??> 'soft_target_update_rate': ${config.softTargetUpdateRate}, </#if> diff --git a/src/main/resources/templates/gluon/reinforcement/params/DqnAgentParams.ftl b/src/main/resources/templates/gluon/reinforcement/params/DqnAgentParams.ftl index 14bb9e687b5f1aa266c9b3285e33bb6f07f025d1..ff16ea6d49251ede96c17884632eeb473a22589b 100644 --- a/src/main/resources/templates/gluon/reinforcement/params/DqnAgentParams.ftl +++ b/src/main/resources/templates/gluon/reinforcement/params/DqnAgentParams.ftl @@ -1,4 +1,4 @@ - 'qnet':qnet_creator.net, + 'qnet':qnet_creator.networks[0], <#if (config.useFixTargetNetwork)?? 
&& config.useFixTargetNetwork> 'use_fix_target': True, 'target_update_interval': ${config.targetNetworkUpdateInterval}, @@ -6,7 +6,7 @@ 'use_fix_target': False, </#if> <#if (config.configuration.loss)??> - 'loss': '${config.lossName}', + 'loss_function': '${config.lossName}', <#if (config.lossParams)??> 'loss_params': { <#list config.lossParams?keys as param> diff --git a/src/main/resources/templates/gluon/reinforcement/params/ReplayMemoryParams.ftl b/src/main/resources/templates/gluon/reinforcement/params/ReplayMemoryParams.ftl index 944187fbcc1be1e585f3761be4bd1213f5ea3e37..139774ad7ac73b30c5df79da712c6c45501c3bda 100644 --- a/src/main/resources/templates/gluon/reinforcement/params/ReplayMemoryParams.ftl +++ b/src/main/resources/templates/gluon/reinforcement/params/ReplayMemoryParams.ftl @@ -10,5 +10,5 @@ 'method': 'online', </#if> 'state_dtype': 'float32', - 'action_dtype': <#if config.rlAlgorithm=="DQN">'uint8'<#else>'float32'</#if>, + 'action_dtype': <#if config.rlAlgorithm=="dqn">'uint8'<#else>'float32'</#if>, 'rewards_dtype': 'float32' diff --git a/src/main/resources/templates/gluon/reinforcement/params/StrategyParams.ftl b/src/main/resources/templates/gluon/reinforcement/params/StrategyParams.ftl index 836a95111705a0f327f2e275ce64c650f39dd34f..b8ffa903d6c7e88ce1a8a7ffa332de2e41946ca2 100644 --- a/src/main/resources/templates/gluon/reinforcement/params/StrategyParams.ftl +++ b/src/main/resources/templates/gluon/reinforcement/params/StrategyParams.ftl @@ -16,7 +16,15 @@ <#if (config.strategy.epsilon_decay_start)??> 'epsilon_decay_start': ${config.strategy.epsilon_decay_start}, </#if> -<#if (config.strategy.method)?? && (config.strategy.method=="ornstein_uhlenbeck")> +<#if (config.strategy.epsilon_decay_per_step)??> + 'epsilon_decay_per_step': ${config.strategy.epsilon_decay_per_step?string('True', 'False')}, +</#if> +<#if (config.strategy.method=="gaussian")> +<#if (config.strategy.noise_variance)??> + 'noise_variance': ${config.strategy.noise_variance}, +</#if> +</#if> +<#if (config.strategy.method)?? && (config.strategy.method=="ornstein_uhlenbeck" || config.strategy.method=="gaussian")> <#if (config.strategy.action_low)?? > 'action_low': ${config.strategy.action_low}, <#else> @@ -27,6 +35,8 @@ <#else> 'action_high' : np.infty, </#if> +</#if> +<#if (config.strategy.method)?? 
&& (config.strategy.method=="ornstein_uhlenbeck")> <#if (config.strategy.mu)??> 'mu': [<#list config.strategy.mu as m>${m}<#if m?has_next>, </#if></#list>], </#if> diff --git a/src/main/resources/templates/gluon/reinforcement/params/Td3AgentParams.ftl b/src/main/resources/templates/gluon/reinforcement/params/Td3AgentParams.ftl new file mode 100644 index 0000000000000000000000000000000000000000..c410e386962815672751cf2178c253b0bf6cde4d --- /dev/null +++ b/src/main/resources/templates/gluon/reinforcement/params/Td3AgentParams.ftl @@ -0,0 +1,10 @@ +<#include "DdpgAgentParams.ftl"> +<#if (config.policyNoise)??> + 'policy_noise': ${config.policyNoise}, +</#if> +<#if (config.noiseClip)??> + 'noise_clip': ${config.noiseClip}, +</#if> +<#if (config.policyDelay)??> + 'policy_delay': ${config.policyDelay}, +</#if> \ No newline at end of file diff --git a/src/main/resources/templates/gluon/reinforcement/util/Util.ftl b/src/main/resources/templates/gluon/reinforcement/util/Util.ftl index 78e7795fd290f99980c689ad1c76f274a821f830..1d8b38c4a32a1cc6c0e364054e746e9115d7d05b 100644 --- a/src/main/resources/templates/gluon/reinforcement/util/Util.ftl +++ b/src/main/resources/templates/gluon/reinforcement/util/Util.ftl @@ -11,8 +11,8 @@ import cnnarch_logger LOSS_FUNCTIONS = { 'l1': gluon.loss.L1Loss(), - 'euclidean': gluon.loss.L2Loss(), - 'huber_loss': gluon.loss.HuberLoss(), + 'l2': gluon.loss.L2Loss(), + 'huber': gluon.loss.HuberLoss(), 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} diff --git a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GenerationTest.java b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GenerationTest.java index 309191eb574cb76c75ad12648a491716fbf7387f..d3290488acd0cb11f7c6d8215998e9547915ca5f 100644 --- a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GenerationTest.java +++ b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/GenerationTest.java @@ -20,12 +20,9 @@ */ package de.monticore.lang.monticar.cnnarch.gluongenerator; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.RewardFunctionSourceGenerator; -import de.monticore.lang.monticar.cnnarch.gluongenerator.util.TrainedArchitectureMockFactory; -import de.monticore.lang.monticar.cnntrain.annotations.Range; -import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; +import de.monticore.lang.monticar.cnnarch.gluongenerator.util.NNArchitectureMockFactory; +import de.monticore.lang.monticar.cnntrain._symboltable.NNArchitectureSymbol; import de.se_rwth.commons.logging.Finding; import de.se_rwth.commons.logging.Log; import freemarker.template.TemplateException; @@ -37,15 +34,9 @@ import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; -import java.util.List; -import java.util.stream.Collector; -import java.util.stream.Collectors; - -import org.junit.contrib.java.lang.system.Assertion; import org.junit.contrib.java.lang.system.ExpectedSystemExit; import static junit.framework.TestCase.assertTrue; import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; public class GenerationTest extends AbstractSymtabTest { private RewardFunctionSourceGenerator rewardFunctionSourceGenerator; @@ -200,7 +191,7 @@ public class GenerationTest extends AbstractSymtabTest { Log.getFindings().clear(); Path modelPath 
= Paths.get("src/test/resources/valid_tests"); CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); - TrainedArchitecture trainedArchitecture = TrainedArchitectureMockFactory.createTrainedArchitectureMock(); + NNArchitectureSymbol trainedArchitecture = NNArchitectureMockFactory.createNNArchitectureMock(); trainGenerator.generate(modelPath, "ReinforcementConfig2", trainedArchitecture); @@ -228,7 +219,7 @@ public class GenerationTest extends AbstractSymtabTest { Log.getFindings().clear(); Path modelPath = Paths.get("src/test/resources/valid_tests"); CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); - TrainedArchitecture trainedArchitecture = TrainedArchitectureMockFactory.createTrainedArchitectureMock(); + NNArchitectureSymbol trainedArchitecture = NNArchitectureMockFactory.createNNArchitectureMock(); trainGenerator.generate(modelPath, "ReinforcementConfig3", trainedArchitecture); @@ -276,9 +267,11 @@ public class GenerationTest extends AbstractSymtabTest { Log.getFindings().clear(); Path modelPath = Paths.get("src/test/resources/valid_tests/ddpg"); CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); - TrainedArchitecture trainedArchitecture = TrainedArchitectureMockFactory.createTrainedArchitectureMock(); + NNArchitectureSymbol trainedArchitecture = NNArchitectureMockFactory.createNNArchitectureMock(); + NNArchitectureSymbol criticArchitecture = NNArchitectureMockFactory.createArchitectureSymbolByCNNArchModel( + Paths.get("./src/test/resources/valid_tests/ddpg/comp"), "CriticNetwork"); - trainGenerator.generate(modelPath, "ActorNetwork", trainedArchitecture); + trainGenerator.generate(modelPath, "ActorNetwork", trainedArchitecture, criticArchitecture); assertTrue(Log.getFindings().stream().noneMatch(Finding::isError)); checkFilesAreEqual( @@ -300,14 +293,47 @@ public class GenerationTest extends AbstractSymtabTest { ); } + @Test + public void testTd3Config() { + Log.getFindings().clear(); + Path modelPath = Paths.get("src/test/resources/valid_tests/td3"); + CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); + NNArchitectureSymbol trainedArchitecture = NNArchitectureMockFactory.createNNArchitectureMock(); + NNArchitectureSymbol criticArchitecture = NNArchitectureMockFactory.createArchitectureSymbolByCNNArchModel( + Paths.get("./src/test/resources/valid_tests/td3/comp"), "CriticNetwork"); + + trainGenerator.generate(modelPath, "TD3Config", trainedArchitecture, criticArchitecture); + + assertTrue(Log.getFindings().stream().noneMatch(Finding::isError)); + checkFilesAreEqual( + Paths.get("./target/generated-sources-cnnarch"), + Paths.get("./src/test/resources/target_code/td3"), + Arrays.asList( + "CNNTrainer_tD3Config.py", + "start_training.sh", + "reinforcement_learning/CNNCreator_CriticNetwork.py", + "reinforcement_learning/CNNNet_CriticNetwork.py", + "reinforcement_learning/__init__.py", + "reinforcement_learning/strategy.py", + "reinforcement_learning/agent.py", + "reinforcement_learning/environment.py", + "reinforcement_learning/replay_memory.py", + "reinforcement_learning/util.py", + "reinforcement_learning/cnnarch_logger.py" + ) + ); + } + @Test public void testRosDdpgConfig() { Log.getFindings().clear(); Path modelPath = Paths.get("src/test/resources/valid_tests/ddpg-ros"); CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); - TrainedArchitecture trainedArchitecture = TrainedArchitectureMockFactory.createTrainedArchitectureMock(); + 
NNArchitectureSymbol trainedArchitecture = NNArchitectureMockFactory.createNNArchitectureMock(); + NNArchitectureSymbol criticArchitecture = NNArchitectureMockFactory.createArchitectureSymbolByCNNArchModel( + Paths.get("./src/test/resources/valid_tests/ddpg-ros/comp"), "RosCriticNetwork"); - trainGenerator.generate(modelPath, "RosActorNetwork", trainedArchitecture); + trainGenerator.generate(modelPath, "RosActorNetwork", trainedArchitecture, criticArchitecture); assertTrue(Log.getFindings().stream().noneMatch(Finding::isError)); checkFilesAreEqual( diff --git a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/IntegrationPythonWrapperTest.java b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/IntegrationPythonWrapperTest.java deleted file mode 100644 index eb2602b538c5b1e1ba1a15e2291d872e46cf69dc..0000000000000000000000000000000000000000 --- a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/IntegrationPythonWrapperTest.java +++ /dev/null @@ -1,59 +0,0 @@ -package de.monticore.lang.monticar.cnnarch.gluongenerator; - -import de.monticore.lang.monticar.cnnarch.gluongenerator.reinforcement.RewardFunctionSourceGenerator; -import de.monticore.lang.monticar.cnnarch.gluongenerator.util.TrainedArchitectureMockFactory; -import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; -import de.se_rwth.commons.logging.Finding; -import de.se_rwth.commons.logging.Log; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; - -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Arrays; -import java.util.stream.Collectors; - -import static junit.framework.TestCase.assertTrue; -import static org.mockito.Mockito.mock; - -public class IntegrationPythonWrapperTest extends AbstractSymtabTest{ - private RewardFunctionSourceGenerator rewardFunctionSourceGenerator; - - @Before - public void setUp() { - // ensure an empty log - Log.getFindings().clear(); - Log.enableFailQuick(false); - rewardFunctionSourceGenerator = mock(RewardFunctionSourceGenerator.class); - } - - @Test - public void testReinforcementConfigWithRewardGeneration() { - Log.getFindings().clear(); - Path modelPath = Paths.get("src/test/resources/valid_tests"); - CNNTrain2Gluon trainGenerator = new CNNTrain2Gluon(rewardFunctionSourceGenerator); - - TrainedArchitecture trainedArchitecture = TrainedArchitectureMockFactory.createTrainedArchitectureMock(); - - trainGenerator.generate(modelPath, "ReinforcementConfig1", trainedArchitecture); - - assertTrue(Log.getFindings().stream().filter(Finding::isError).collect(Collectors.toList()).isEmpty()); - checkFilesAreEqual( - Paths.get("./target/generated-sources-cnnarch"), - Paths.get("./src/test/resources/target_code/ReinforcementConfig1"), - Arrays.asList( - "CNNTrainer_reinforcementConfig1.py", - "start_training.sh", - "reinforcement_learning/__init__.py", - "reinforcement_learning/strategy.py", - "reinforcement_learning/agent.py", - "reinforcement_learning/environment.py", - "reinforcement_learning/replay_memory.py", - "reinforcement_learning/util.py", - "reinforcement_learning/cnnarch_logger.py") - ); - assertTrue(Paths.get("./target/generated-sources-cnnarch/reward/pylib").toFile().isDirectory()); - } - -} diff --git a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/util/TrainedArchitectureMockFactory.java b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/util/NNArchitectureMockFactory.java similarity index 53% rename from 
src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/util/TrainedArchitectureMockFactory.java rename to src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/util/NNArchitectureMockFactory.java index 7470b9dd8f5e564d5a6c242fca3bdfbfaa57a37c..0a690724710466026af29da9920016f8a50ca9fd 100644 --- a/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/util/TrainedArchitectureMockFactory.java +++ b/src/test/java/de/monticore/lang/monticar/cnnarch/gluongenerator/util/NNArchitectureMockFactory.java @@ -2,19 +2,25 @@ package de.monticore.lang.monticar.cnnarch.gluongenerator.util; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; +import de.monticore.lang.monticar.cnnarch._symboltable.ArchitectureSymbol; +import de.monticore.lang.monticar.cnnarch.generator.CNNArchSymbolCompiler; +import de.monticore.lang.monticar.cnnarch.gluongenerator.CNNArch2GluonArchitectureSupportChecker; +import de.monticore.lang.monticar.cnnarch.gluongenerator.CNNArch2GluonLayerSupportChecker; +import de.monticore.lang.monticar.cnnarch.gluongenerator.annotations.ArchitectureAdapter; +import de.monticore.lang.monticar.cnntrain._symboltable.NNArchitectureSymbol; import de.monticore.lang.monticar.cnntrain.annotations.Range; -import de.monticore.lang.monticar.cnntrain.annotations.TrainedArchitecture; +import java.nio.file.Path; import java.util.List; import java.util.Map; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -public class TrainedArchitectureMockFactory { +public class NNArchitectureMockFactory { - public static TrainedArchitecture createTrainedArchitectureMock() { - TrainedArchitecture trainedArchitecture = mock(TrainedArchitecture.class); + public static NNArchitectureSymbol createNNArchitectureMock() { + NNArchitectureSymbol trainedArchitecture = mock(NNArchitectureSymbol.class); final String inputName = "state"; final String outputName = "action"; @@ -45,4 +51,12 @@ public class TrainedArchitectureMockFactory { return trainedArchitecture; } + public static NNArchitectureSymbol createArchitectureSymbolByCNNArchModel(final Path modelsDirPath, + final String rootModel) { + CNNArchSymbolCompiler symbolCompiler = new CNNArchSymbolCompiler(new CNNArch2GluonArchitectureSupportChecker(), + new CNNArch2GluonLayerSupportChecker()); + ArchitectureSymbol architectureSymbol = symbolCompiler.compileArchitectureSymbolFromModelsDir(modelsDirPath, rootModel); + architectureSymbol.setComponentName(rootModel); + return new ArchitectureAdapter(rootModel, architectureSymbol); + } } diff --git a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py index 6dbf9ef17065c25832844649b1c3bb16009f4e6f..148c89848b8d990b5f635c3976fd15e71a608893 100644 --- a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py @@ -124,10 +124,10 @@ class Agent(object): def _make_pickle_ready(self, session_dir): del self._training_stats.logger - self._logger = None self._environment.close() self._environment = None - self._save_net(self._best_net, 'best_net', session_dir) + self._export_net(self._best_net, 'best_net', filedir=session_dir) + self._logger = None self._best_net = None def _make_config_dict(self): @@ -258,26 +258,22 @@ class Agent(object): return self._target_score is not None\ and avg_reward > self._target_score - def 
_save_parameters(self, net, episode=None, filename='dqn-agent-params'): + def _export_net(self, net, filename, filedir=None, episode=None): assert self._output_directory assert isinstance(net, gluon.HybridBlock) make_directory_if_not_exist(self._output_directory) + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename) - if(episode is not None): - self._logger.info( - 'Saving model parameters after episode %d' % episode) + if episode is not None: filename = filename + '-ep{}'.format(episode) - else: - self._logger.info('Saving model parameters') - self._save_net(net, filename) - def _save_net(self, net, filename, filedir=None): - filedir = self._output_directory if filedir is None else filedir - filename = os.path.join(filedir, filename) - net.save_parameters(filename + '.params') net.export(filename, epoch=0) + net.save_parameters(filename + '.params') - def save_best_network(self, path, epoch=0): + def export_best_network(self, path=None, epoch=0): + path = os.path.join(self._output_directory, 'best_network')\ + if path is None else path self._logger.info( 'Saving best network with average reward of {}'.format( self._best_avg_score)) @@ -373,15 +369,17 @@ class DdpgAgent(Agent): def _make_pickle_ready(self, session_dir): super(DdpgAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._actor, 'current_actor') + self._export_net(self._actor, 'current_actor') - self._save_net(self._actor, 'actor', session_dir) + self._export_net(self._actor, 'actor', filedir=session_dir) self._actor = None - self._save_net(self._critic, 'critic', session_dir) + self._export_net(self._critic, 'critic', filedir=session_dir) self._critic = None - self._save_net(self._actor_target, 'actor_target', session_dir) + self._export_net( + self._actor_target, 'actor_target', filedir=session_dir) self._actor_target = None - self._save_net(self._critic_target, 'critic_target', session_dir) + self._export_net( + self._critic_target, 'critic_target', filedir=session_dir) self._critic_target = None @classmethod @@ -449,7 +447,8 @@ class DdpgAgent(Agent): return action[0].asnumpy() def save_parameters(self, episode): - self._save_parameters(self._actor, episode=episode) + self._export_net( + self._actor, self._agent_name + '_actor', episode=episode) def train(self, episodes=None): self.save_config_file() @@ -504,6 +503,7 @@ class DdpgAgent(Agent): # actor and exploration noise N according to strategy action = self._strategy.select_action( self.get_next_action(state)) + self._strategy.decay(self._current_episode) # Execute action a and observe reward r and next state ns next_state, reward, terminal, _ = \ @@ -545,10 +545,10 @@ class DdpgAgent(Agent): # with critic parameters tmp_critic = self._copy_critic() with autograd.record(): - actor_qvalues = tmp_critic(states, self._actor(states)) # For maximizing qvalues we have to multiply with -1 # as we use a minimizer - actor_loss = -1 * actor_qvalues.mean() + actor_loss = -tmp_critic( + states, self._actor(states)).mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -566,7 +566,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(actor_qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues.asnumpy()) / self._minibatch_size training_steps += 1 @@ -594,7 +594,6 @@ class DdpgAgent(Agent): self._strategy.cur_eps, episode_reward) self._do_snapshot_if_in_interval(self._current_episode) - 
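Above, the old _save_parameters/_save_net pair is collapsed into a single _export_net helper, and save_best_network becomes export_best_network with a default path. The behaviour rests on two stock Gluon calls: HybridBlock.export writes a <prefix>-symbol.json / <prefix>-0000.params pair that can be loaded without the Python class, while save_parameters writes a plain <prefix>.params file. A standalone illustration with a throwaway toy network follows; all names here are hypothetical.

import os
import tempfile
import mxnet as mx
from mxnet import gluon, nd

net = gluon.nn.HybridSequential()
net.add(gluon.nn.Dense(4), gluon.nn.Dense(1))
net.initialize(mx.init.Normal())
net.hybridize()
net(nd.ones((1, 8)))                      # run one forward pass so the graph can be exported

prefix = os.path.join(tempfile.mkdtemp(), 'qnet')
net.export(prefix, epoch=0)               # writes qnet-symbol.json and qnet-0000.params
net.save_parameters(prefix + '.params')   # writes qnet.params
print(sorted(os.listdir(os.path.dirname(prefix))))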
self._strategy.decay(self._current_episode) if self._is_target_reached(avg_reward): self._logger.info( @@ -605,7 +604,7 @@ class DdpgAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -641,6 +640,359 @@ class DdpgAgent(Agent): self._critic, self._state_dim, self._action_dim, ctx=self._ctx) +class TwinDelayedDdpgAgent(DdpgAgent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None, + policy_noise=0.2, + noise_clip=0.5, + policy_delay=2 + ): + super(TwinDelayedDdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples, + critic=critic, soft_target_update_rate=soft_target_update_rate, + actor=actor, actor_optimizer=actor_optimizer, + actor_optimizer_params=actor_optimizer_params, + critic_optimizer=critic_optimizer, + critic_optimizer_params=critic_optimizer_params) + + self._policy_noise = policy_noise + self._noise_clip = noise_clip + self._policy_delay = policy_delay + + self._critic2 = self._critic.__class__() + self._critic2.collect_params().initialize( + mx.init.Normal(), ctx=self._ctx) + self._critic2.hybridize() + self._critic2(nd.ones((1,) + state_dim, ctx=self._ctx), + nd.ones((1,) + action_dim, ctx=self._ctx)) + + self._critic2_target = self._copy_critic2() + + self._critic2_optimizer = critic_optimizer + self._critic2_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + def _make_pickle_ready(self, session_dir): + super(TwinDelayedDdpgAgent, self)._make_pickle_ready(session_dir) + self._export_net(self._critic2, 'critic2', filedir=session_dir) + self._critic2 = None + self._export_net( + self._critic2_target, 'critic2_target', filedir=session_dir) + self._critic2_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = 
os.path.join( + session_dir, 'critic_target.params') + files['critic2_net_params'] = os.path.join( + session_dir, 'critic2.params') + files['critic2_target_net_params'] = os.path.join( + session_dir, 'critic2_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._critic2 = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2.load_parameters(files['critic2_net_params'], agent._ctx) + agent._critic2.hybridize() + agent._critic2( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic2_target = copy_net_with_two_inputs( + agent._critic2, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2_target.load_parameters( + files['critic2_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _copy_critic2(self): + assert self._critic2 is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic2, self._state_dim, self._action_dim, ctx=self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start TwinDelayedDDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q1' and Q2' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + self._critic2_target = self._copy_critic2() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + trainer_critic2 = gluon.Trainer( + 
self._critic2.collect_params(), self._critic2_optimizer, + self._critic2_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + actor_updates = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + self._strategy.decay(self._current_episode) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + clipped_noise = nd.array( + np.clip( + np.random.normal( + loc=0, scale=self._policy_noise, + size=self._minibatch_size + ).reshape(self._minibatch_size, 1), + -self._noise_clip, + self._noise_clip + ), + ctx=self._ctx + ) + target_action = np.clip( + self._actor_target(next_states) + clipped_noise, + self._strategy._action_low, + self._strategy._action_high) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + target_qvalues1 = self._critic_target(next_states, + target_action) + target_qvalues2 = self._critic2_target(next_states, + target_action) + target_qvalues = nd.minimum(target_qvalues1, + target_qvalues2) + y = rewards + (1 - terminals) * self._discount_factor\ + * target_qvalues + + with autograd.record(): + qvalues1 = self._critic(states, actions) + critic1_loss = l2_loss(qvalues1, y) + critic1_loss.backward() + trainer_critic.step(self._minibatch_size) + + with autograd.record(): + qvalues2 = self._critic2(states, actions) + critic2_loss = l2_loss(qvalues2, y) + critic2_loss.backward() + trainer_critic2.step(self._minibatch_size) + + critic_loss = (critic1_loss.mean() + critic2_loss.mean())/2 + + if self._total_steps % self._policy_delay == 0: + tmp_critic = self._copy_critic() + with autograd.record(): + actor_loss = -tmp_critic( + states, self._actor(states)).mean() + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + self._critic2_target = self._soft_update( + self._critic2, self._critic2_target, + self._soft_target_update_rate) + + actor_updates = actor_updates + 1 + else: + actor_loss = nd.array([0], ctx=self._ctx) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss += 0 if actor_updates == 0 else\ + np.sum(actor_loss.asnumpy()[0]) + episode_avg_q_value +=\ + np.sum(target_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + 
episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if actor_updates == 0\ + else (episode_actor_loss / actor_updates) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if actor_updates == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.export_best_network() + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(TwinDelayedDdpgAgent, self)._make_config_dict() + config['policy_noise'] = self._policy_noise + config['noise_clip'] = self._noise_clip + config['policy_delay'] = self._policy_delay + return config + + class DqnAgent(Agent): def __init__( self, @@ -652,7 +1004,7 @@ class DqnAgent(Agent): action_dim, ctx=None, discount_factor=.9, - loss_function='euclidean', + loss_function='l2', optimizer='rmsprop', optimizer_params={'learning_rate': 0.09}, training_episodes=50, @@ -691,6 +1043,14 @@ class DqnAgent(Agent): self._double_dqn = double_dqn self._use_fix_target = use_fix_target + # Build memory buffer for discrete actions + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = (1,) + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + # Initialize best network self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) self._best_avg_score = -np.infty @@ -740,10 +1100,10 @@ class DqnAgent(Agent): def _make_pickle_ready(self, session_dir): super(DqnAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._qnet, 'current_qnet') - self._save_net(self._qnet, 'qnet', session_dir) + self._export_net(self._qnet, 'current_qnet') + self._export_net(self._qnet, 'qnet', filedir=session_dir) self._qnet = None - self._save_net(self._target_qnet, 'target_net', session_dir) + self._export_net(self._target_qnet, 'target_net', filedir=session_dir) self._target_qnet = None def get_q_values(self, state, with_best=False): @@ -839,6 +1199,7 @@ class DqnAgent(Agent): # 1. Choose an action based on current game state and policy q_values = self._qnet(nd.array([state], ctx=self._ctx)) action = self._strategy.select_action(q_values[0]) + self._strategy.decay(self._current_episode) # 2. Play the game for a single step next_state, reward, terminal, _ =\ @@ -846,7 +1207,7 @@ class DqnAgent(Agent): # 3. Store transition in replay memory self._memory.append( - state, action, reward, next_state, terminal) + state, [action], reward, next_state, terminal) # 4. 
Train the network if in interval if self._do_training(): @@ -875,8 +1236,6 @@ class DqnAgent(Agent): self._current_episode, start, training_steps, episode_loss, self._strategy.cur_eps, episode_reward) - self._strategy.decay(self._current_episode) - if self._is_target_reached(avg_reward): self._logger.info( 'Target score is reached in average; Training is stopped') @@ -886,7 +1245,7 @@ class DqnAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -902,7 +1261,8 @@ class DqnAgent(Agent): return config def save_parameters(self, episode): - self._save_parameters(self._qnet, episode=episode) + self._export_net( + self._qnet, self._agent_name + '_qnet', episode=episode) def _save_current_as_best_net(self): self._best_net = copy_net( diff --git a/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py b/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py index 162301a67a00804dd835c8574b8a6773d733abb3..d7c3cc96ead612b22b702a88975643285d2ceb55 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py +++ b/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py @@ -56,7 +56,7 @@ if __name__ == "__main__": 'memory_size': 10000, 'sample_size': 32, 'state_dtype': 'float32', - 'action_dtype': 'float32', + 'action_dtype': 'uint8', 'rewards_dtype': 'float32' }, 'strategy_params': { @@ -79,9 +79,9 @@ if __name__ == "__main__": 'max_episode_step': 250, 'evaluation_samples': 100, 'target_score': 185.5, - 'qnet':qnet_creator.net, + 'qnet':qnet_creator.networks[0], 'use_fix_target': False, - 'loss': 'l2', + 'loss_function': 'l2', 'optimizer': 'rmsprop', 'optimizer_params': { 'weight_decay': 0.01, @@ -120,4 +120,4 @@ if __name__ == "__main__": train_successful = agent.train() if train_successful: - agent.save_best_network(qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_0_newest', epoch=0) + agent.export_best_network(path=qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_0_newest', epoch=0) diff --git a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py index 6dbf9ef17065c25832844649b1c3bb16009f4e6f..148c89848b8d990b5f635c3976fd15e71a608893 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py @@ -124,10 +124,10 @@ class Agent(object): def _make_pickle_ready(self, session_dir): del self._training_stats.logger - self._logger = None self._environment.close() self._environment = None - self._save_net(self._best_net, 'best_net', session_dir) + self._export_net(self._best_net, 'best_net', filedir=session_dir) + self._logger = None self._best_net = None def _make_config_dict(self): @@ -258,26 +258,22 @@ class Agent(object): return self._target_score is not None\ and avg_reward > self._target_score - def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + def _export_net(self, net, filename, filedir=None, episode=None): assert self._output_directory assert isinstance(net, gluon.HybridBlock) 
make_directory_if_not_exist(self._output_directory) + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename) - if(episode is not None): - self._logger.info( - 'Saving model parameters after episode %d' % episode) + if episode is not None: filename = filename + '-ep{}'.format(episode) - else: - self._logger.info('Saving model parameters') - self._save_net(net, filename) - def _save_net(self, net, filename, filedir=None): - filedir = self._output_directory if filedir is None else filedir - filename = os.path.join(filedir, filename) - net.save_parameters(filename + '.params') net.export(filename, epoch=0) + net.save_parameters(filename + '.params') - def save_best_network(self, path, epoch=0): + def export_best_network(self, path=None, epoch=0): + path = os.path.join(self._output_directory, 'best_network')\ + if path is None else path self._logger.info( 'Saving best network with average reward of {}'.format( self._best_avg_score)) @@ -373,15 +369,17 @@ class DdpgAgent(Agent): def _make_pickle_ready(self, session_dir): super(DdpgAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._actor, 'current_actor') + self._export_net(self._actor, 'current_actor') - self._save_net(self._actor, 'actor', session_dir) + self._export_net(self._actor, 'actor', filedir=session_dir) self._actor = None - self._save_net(self._critic, 'critic', session_dir) + self._export_net(self._critic, 'critic', filedir=session_dir) self._critic = None - self._save_net(self._actor_target, 'actor_target', session_dir) + self._export_net( + self._actor_target, 'actor_target', filedir=session_dir) self._actor_target = None - self._save_net(self._critic_target, 'critic_target', session_dir) + self._export_net( + self._critic_target, 'critic_target', filedir=session_dir) self._critic_target = None @classmethod @@ -449,7 +447,8 @@ class DdpgAgent(Agent): return action[0].asnumpy() def save_parameters(self, episode): - self._save_parameters(self._actor, episode=episode) + self._export_net( + self._actor, self._agent_name + '_actor', episode=episode) def train(self, episodes=None): self.save_config_file() @@ -504,6 +503,7 @@ class DdpgAgent(Agent): # actor and exploration noise N according to strategy action = self._strategy.select_action( self.get_next_action(state)) + self._strategy.decay(self._current_episode) # Execute action a and observe reward r and next state ns next_state, reward, terminal, _ = \ @@ -545,10 +545,10 @@ class DdpgAgent(Agent): # with critic parameters tmp_critic = self._copy_critic() with autograd.record(): - actor_qvalues = tmp_critic(states, self._actor(states)) # For maximizing qvalues we have to multiply with -1 # as we use a minimizer - actor_loss = -1 * actor_qvalues.mean() + actor_loss = -tmp_critic( + states, self._actor(states)).mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -566,7 +566,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(actor_qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues.asnumpy()) / self._minibatch_size training_steps += 1 @@ -594,7 +594,6 @@ class DdpgAgent(Agent): self._strategy.cur_eps, episode_reward) self._do_snapshot_if_in_interval(self._current_episode) - self._strategy.decay(self._current_episode) if self._is_target_reached(avg_reward): self._logger.info( @@ -605,7 +604,7 @@ class DdpgAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - 
self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -641,6 +640,359 @@ class DdpgAgent(Agent): self._critic, self._state_dim, self._action_dim, ctx=self._ctx) +class TwinDelayedDdpgAgent(DdpgAgent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None, + policy_noise=0.2, + noise_clip=0.5, + policy_delay=2 + ): + super(TwinDelayedDdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples, + critic=critic, soft_target_update_rate=soft_target_update_rate, + actor=actor, actor_optimizer=actor_optimizer, + actor_optimizer_params=actor_optimizer_params, + critic_optimizer=critic_optimizer, + critic_optimizer_params=critic_optimizer_params) + + self._policy_noise = policy_noise + self._noise_clip = noise_clip + self._policy_delay = policy_delay + + self._critic2 = self._critic.__class__() + self._critic2.collect_params().initialize( + mx.init.Normal(), ctx=self._ctx) + self._critic2.hybridize() + self._critic2(nd.ones((1,) + state_dim, ctx=self._ctx), + nd.ones((1,) + action_dim, ctx=self._ctx)) + + self._critic2_target = self._copy_critic2() + + self._critic2_optimizer = critic_optimizer + self._critic2_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + def _make_pickle_ready(self, session_dir): + super(TwinDelayedDdpgAgent, self)._make_pickle_ready(session_dir) + self._export_net(self._critic2, 'critic2', filedir=session_dir) + self._critic2 = None + self._export_net( + self._critic2_target, 'critic2_target', filedir=session_dir) + self._critic2_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + files['critic2_net_params'] = os.path.join( + session_dir, 'critic2.params') + files['critic2_target_net_params'] = os.path.join( + session_dir, 
'critic2_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._critic2 = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2.load_parameters(files['critic2_net_params'], agent._ctx) + agent._critic2.hybridize() + agent._critic2( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic2_target = copy_net_with_two_inputs( + agent._critic2, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2_target.load_parameters( + files['critic2_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _copy_critic2(self): + assert self._critic2 is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic2, self._state_dim, self._action_dim, ctx=self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start TwinDelayedDDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q1' and Q2' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + self._critic2_target = self._copy_critic2() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + trainer_critic2 = gluon.Trainer( + self._critic2.collect_params(), self._critic2_optimizer, + self._critic2_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + 
return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + actor_updates = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + self._strategy.decay(self._current_episode) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + clipped_noise = nd.array( + np.clip( + np.random.normal( + loc=0, scale=self._policy_noise, + size=self._minibatch_size + ).reshape(self._minibatch_size, 1), + -self._noise_clip, + self._noise_clip + ), + ctx=self._ctx + ) + target_action = np.clip( + self._actor_target(next_states) + clipped_noise, + self._strategy._action_low, + self._strategy._action_high) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + target_qvalues1 = self._critic_target(next_states, + target_action) + target_qvalues2 = self._critic2_target(next_states, + target_action) + target_qvalues = nd.minimum(target_qvalues1, + target_qvalues2) + y = rewards + (1 - terminals) * self._discount_factor\ + * target_qvalues + + with autograd.record(): + qvalues1 = self._critic(states, actions) + critic1_loss = l2_loss(qvalues1, y) + critic1_loss.backward() + trainer_critic.step(self._minibatch_size) + + with autograd.record(): + qvalues2 = self._critic2(states, actions) + critic2_loss = l2_loss(qvalues2, y) + critic2_loss.backward() + trainer_critic2.step(self._minibatch_size) + + critic_loss = (critic1_loss.mean() + critic2_loss.mean())/2 + + if self._total_steps % self._policy_delay == 0: + tmp_critic = self._copy_critic() + with autograd.record(): + actor_loss = -tmp_critic( + states, self._actor(states)).mean() + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + self._critic2_target = self._soft_update( + self._critic2, self._critic2_target, + self._soft_target_update_rate) + + actor_updates = actor_updates + 1 + else: + actor_loss = nd.array([0], ctx=self._ctx) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss += 0 if actor_updates == 0 else\ + np.sum(actor_loss.asnumpy()[0]) + episode_avg_q_value +=\ + np.sum(target_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if actor_updates 
== 0\ + else (episode_actor_loss / actor_updates) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if actor_updates == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.export_best_network() + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(TwinDelayedDdpgAgent, self)._make_config_dict() + config['policy_noise'] = self._policy_noise + config['noise_clip'] = self._noise_clip + config['policy_delay'] = self._policy_delay + return config + + class DqnAgent(Agent): def __init__( self, @@ -652,7 +1004,7 @@ class DqnAgent(Agent): action_dim, ctx=None, discount_factor=.9, - loss_function='euclidean', + loss_function='l2', optimizer='rmsprop', optimizer_params={'learning_rate': 0.09}, training_episodes=50, @@ -691,6 +1043,14 @@ class DqnAgent(Agent): self._double_dqn = double_dqn self._use_fix_target = use_fix_target + # Build memory buffer for discrete actions + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = (1,) + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + # Initialize best network self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) self._best_avg_score = -np.infty @@ -740,10 +1100,10 @@ class DqnAgent(Agent): def _make_pickle_ready(self, session_dir): super(DqnAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._qnet, 'current_qnet') - self._save_net(self._qnet, 'qnet', session_dir) + self._export_net(self._qnet, 'current_qnet') + self._export_net(self._qnet, 'qnet', filedir=session_dir) self._qnet = None - self._save_net(self._target_qnet, 'target_net', session_dir) + self._export_net(self._target_qnet, 'target_net', filedir=session_dir) self._target_qnet = None def get_q_values(self, state, with_best=False): @@ -839,6 +1199,7 @@ class DqnAgent(Agent): # 1. Choose an action based on current game state and policy q_values = self._qnet(nd.array([state], ctx=self._ctx)) action = self._strategy.select_action(q_values[0]) + self._strategy.decay(self._current_episode) # 2. Play the game for a single step next_state, reward, terminal, _ =\ @@ -846,7 +1207,7 @@ class DqnAgent(Agent): # 3. Store transition in replay memory self._memory.append( - state, action, reward, next_state, terminal) + state, [action], reward, next_state, terminal) # 4. 
Train the network if in interval if self._do_training(): @@ -875,8 +1236,6 @@ class DqnAgent(Agent): self._current_episode, start, training_steps, episode_loss, self._strategy.cur_eps, episode_reward) - self._strategy.decay(self._current_episode) - if self._is_target_reached(avg_reward): self._logger.info( 'Target score is reached in average; Training is stopped') @@ -886,7 +1245,7 @@ class DqnAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -902,7 +1261,8 @@ class DqnAgent(Agent): return config def save_parameters(self, episode): - self._save_parameters(self._qnet, episode=episode) + self._export_net( + self._qnet, self._agent_name + '_qnet', episode=episode) def _save_current_as_best_net(self): self._best_net = copy_net( diff --git a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/strategy.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/strategy.py index 8e4c279e5569d5b1505ee4fb9f6e9136a40c30bb..d2d2386658c56c5ebb6d654468669a2664622305 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/strategy.py +++ b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/strategy.py @@ -13,18 +13,21 @@ class StrategyBuilder(object): epsilon_decay_method='no', epsilon_decay=0.0, epsilon_decay_start=0, + epsilon_decay_per_step=False, action_dim=None, action_low=None, action_high=None, mu=0.0, theta=0.5, - sigma=0.3 + sigma=0.3, + noise_variance=0.1 ): if epsilon_decay_method == 'linear': decay = LinearDecay( eps_decay=epsilon_decay, min_eps=min_epsilon, - decay_start=epsilon_decay_start) + decay_start=epsilon_decay_start, + decay_per_step=epsilon_decay_per_step) else: decay = NoDecay() @@ -44,6 +47,13 @@ class StrategyBuilder(object): return OrnsteinUhlenbeckStrategy( action_dim, action_low, action_high, epsilon, mu, theta, sigma, decay) + elif method == 'gaussian': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert noise_variance is not None + return GaussianNoiseStrategy(action_dim, action_low, action_high, + epsilon, noise_variance, decay) else: assert action_dim is not None assert len(action_dim) == 1 @@ -70,17 +80,27 @@ class NoDecay(BaseDecay): class LinearDecay(BaseDecay): - def __init__(self, eps_decay, min_eps=0, decay_start=0): + def __init__(self, eps_decay, min_eps=0, decay_start=0, decay_per_step=False): super(LinearDecay, self).__init__() self.eps_decay = eps_decay self.min_eps = min_eps self.decay_start = decay_start + self.decay_per_step = decay_per_step + self.last_episode = -1 - def decay(self, cur_eps, episode): - if episode < self.decay_start: - return cur_eps + def do_decay(self, episode): + if self.decay_per_step: + do = (episode >= self.decay_start) else: + do = ((self.last_episode != episode) and (episode >= self.decay_start)) + self.last_episode = episode + return do + + def decay(self, cur_eps, episode): + if self.do_decay(episode): return max(cur_eps - self.eps_decay, self.min_eps) + else: + return cur_eps class BaseStrategy(object): @@ -170,3 +190,29 @@ class OrnsteinUhlenbeckStrategy(BaseStrategy): noise = self._evolve_state() action = (1.0 - self.cur_eps) * values + (self.cur_eps * noise) return np.clip(action, self._action_low, self._action_high) 
+ + +class GaussianNoiseStrategy(BaseStrategy): + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + noise_variance, + decay=NoDecay() + ): + super(GaussianNoiseStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._noise_variance = noise_variance + + def select_action(self, values): + noise = np.random.normal(loc=0.0, scale=self._noise_variance, size=self._action_dim) + action = values + self.cur_eps * noise + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/util.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/util.py index 78e7795fd290f99980c689ad1c76f274a821f830..1d8b38c4a32a1cc6c0e364054e746e9115d7d05b 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/util.py +++ b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/util.py @@ -11,8 +11,8 @@ import cnnarch_logger LOSS_FUNCTIONS = { 'l1': gluon.loss.L1Loss(), - 'euclidean': gluon.loss.L2Loss(), - 'huber_loss': gluon.loss.HuberLoss(), + 'l2': gluon.loss.L2Loss(), + 'huber': gluon.loss.HuberLoss(), 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} diff --git a/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py b/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py index e59d9b1e62fb757f9c8fa904080054db5a7c0bfd..6af8211440e03e79ea95cad97bd21eb00a8718ac 100644 --- a/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py +++ b/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py @@ -63,7 +63,7 @@ if __name__ == "__main__": 'memory_size': 1000000, 'sample_size': 64, 'state_dtype': 'float32', - 'action_dtype': 'float32', + 'action_dtype': 'uint8', 'rewards_dtype': 'float32' }, 'strategy_params': { @@ -84,10 +84,10 @@ if __name__ == "__main__": 'snapshot_interval': 500, 'max_episode_step': 10000, 'target_score': 35000, - 'qnet':qnet_creator.net, + 'qnet':qnet_creator.networks[0], 'use_fix_target': True, 'target_update_interval': 500, - 'loss': 'huber', + 'loss_function': 'huber', 'optimizer': 'adam', 'optimizer_params': { 'learning_rate': 0.001 }, @@ -114,4 +114,4 @@ if __name__ == "__main__": train_successful = agent.train() if train_successful: - agent.save_best_network(qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_0_newest', epoch=0) + agent.export_best_network(path=qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_0_newest', epoch=0) diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py index 6dbf9ef17065c25832844649b1c3bb16009f4e6f..148c89848b8d990b5f635c3976fd15e71a608893 100644 --- a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py @@ -124,10 +124,10 @@ class Agent(object): def _make_pickle_ready(self, session_dir): del self._training_stats.logger - self._logger = None self._environment.close() self._environment = None - self._save_net(self._best_net, 'best_net', session_dir) + self._export_net(self._best_net, 
'best_net', filedir=session_dir) + self._logger = None self._best_net = None def _make_config_dict(self): @@ -258,26 +258,22 @@ class Agent(object): return self._target_score is not None\ and avg_reward > self._target_score - def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + def _export_net(self, net, filename, filedir=None, episode=None): assert self._output_directory assert isinstance(net, gluon.HybridBlock) make_directory_if_not_exist(self._output_directory) + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename) - if(episode is not None): - self._logger.info( - 'Saving model parameters after episode %d' % episode) + if episode is not None: filename = filename + '-ep{}'.format(episode) - else: - self._logger.info('Saving model parameters') - self._save_net(net, filename) - def _save_net(self, net, filename, filedir=None): - filedir = self._output_directory if filedir is None else filedir - filename = os.path.join(filedir, filename) - net.save_parameters(filename + '.params') net.export(filename, epoch=0) + net.save_parameters(filename + '.params') - def save_best_network(self, path, epoch=0): + def export_best_network(self, path=None, epoch=0): + path = os.path.join(self._output_directory, 'best_network')\ + if path is None else path self._logger.info( 'Saving best network with average reward of {}'.format( self._best_avg_score)) @@ -373,15 +369,17 @@ class DdpgAgent(Agent): def _make_pickle_ready(self, session_dir): super(DdpgAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._actor, 'current_actor') + self._export_net(self._actor, 'current_actor') - self._save_net(self._actor, 'actor', session_dir) + self._export_net(self._actor, 'actor', filedir=session_dir) self._actor = None - self._save_net(self._critic, 'critic', session_dir) + self._export_net(self._critic, 'critic', filedir=session_dir) self._critic = None - self._save_net(self._actor_target, 'actor_target', session_dir) + self._export_net( + self._actor_target, 'actor_target', filedir=session_dir) self._actor_target = None - self._save_net(self._critic_target, 'critic_target', session_dir) + self._export_net( + self._critic_target, 'critic_target', filedir=session_dir) self._critic_target = None @classmethod @@ -449,7 +447,8 @@ class DdpgAgent(Agent): return action[0].asnumpy() def save_parameters(self, episode): - self._save_parameters(self._actor, episode=episode) + self._export_net( + self._actor, self._agent_name + '_actor', episode=episode) def train(self, episodes=None): self.save_config_file() @@ -504,6 +503,7 @@ class DdpgAgent(Agent): # actor and exploration noise N according to strategy action = self._strategy.select_action( self.get_next_action(state)) + self._strategy.decay(self._current_episode) # Execute action a and observe reward r and next state ns next_state, reward, terminal, _ = \ @@ -545,10 +545,10 @@ class DdpgAgent(Agent): # with critic parameters tmp_critic = self._copy_critic() with autograd.record(): - actor_qvalues = tmp_critic(states, self._actor(states)) # For maximizing qvalues we have to multiply with -1 # as we use a minimizer - actor_loss = -1 * actor_qvalues.mean() + actor_loss = -tmp_critic( + states, self._actor(states)).mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -566,7 +566,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(actor_qvalues.asnumpy()) / self._minibatch_size + 
np.sum(qvalues.asnumpy()) / self._minibatch_size training_steps += 1 @@ -594,7 +594,6 @@ class DdpgAgent(Agent): self._strategy.cur_eps, episode_reward) self._do_snapshot_if_in_interval(self._current_episode) - self._strategy.decay(self._current_episode) if self._is_target_reached(avg_reward): self._logger.info( @@ -605,7 +604,7 @@ class DdpgAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -641,6 +640,359 @@ class DdpgAgent(Agent): self._critic, self._state_dim, self._action_dim, ctx=self._ctx) +class TwinDelayedDdpgAgent(DdpgAgent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None, + policy_noise=0.2, + noise_clip=0.5, + policy_delay=2 + ): + super(TwinDelayedDdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples, + critic=critic, soft_target_update_rate=soft_target_update_rate, + actor=actor, actor_optimizer=actor_optimizer, + actor_optimizer_params=actor_optimizer_params, + critic_optimizer=critic_optimizer, + critic_optimizer_params=critic_optimizer_params) + + self._policy_noise = policy_noise + self._noise_clip = noise_clip + self._policy_delay = policy_delay + + self._critic2 = self._critic.__class__() + self._critic2.collect_params().initialize( + mx.init.Normal(), ctx=self._ctx) + self._critic2.hybridize() + self._critic2(nd.ones((1,) + state_dim, ctx=self._ctx), + nd.ones((1,) + action_dim, ctx=self._ctx)) + + self._critic2_target = self._copy_critic2() + + self._critic2_optimizer = critic_optimizer + self._critic2_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + def _make_pickle_ready(self, session_dir): + super(TwinDelayedDdpgAgent, self)._make_pickle_ready(session_dir) + self._export_net(self._critic2, 'critic2', filedir=session_dir) + self._critic2 = None + self._export_net( + self._critic2_target, 'critic2_target', filedir=session_dir) + self._critic2_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 
'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + files['critic2_net_params'] = os.path.join( + session_dir, 'critic2.params') + files['critic2_target_net_params'] = os.path.join( + session_dir, 'critic2_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._critic2 = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2.load_parameters(files['critic2_net_params'], agent._ctx) + agent._critic2.hybridize() + agent._critic2( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic2_target = copy_net_with_two_inputs( + agent._critic2, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2_target.load_parameters( + files['critic2_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _copy_critic2(self): + assert self._critic2 is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic2, self._state_dim, self._action_dim, ctx=self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start TwinDelayedDDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q1' and Q2' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + self._critic2_target = self._copy_critic2() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + 
self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + trainer_critic2 = gluon.Trainer( + self._critic2.collect_params(), self._critic2_optimizer, + self._critic2_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + actor_updates = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + self._strategy.decay(self._current_episode) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + clipped_noise = nd.array( + np.clip( + np.random.normal( + loc=0, scale=self._policy_noise, + size=self._minibatch_size + ).reshape(self._minibatch_size, 1), + -self._noise_clip, + self._noise_clip + ), + ctx=self._ctx + ) + target_action = np.clip( + self._actor_target(next_states) + clipped_noise, + self._strategy._action_low, + self._strategy._action_high) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + target_qvalues1 = self._critic_target(next_states, + target_action) + target_qvalues2 = self._critic2_target(next_states, + target_action) + target_qvalues = nd.minimum(target_qvalues1, + target_qvalues2) + y = rewards + (1 - terminals) * self._discount_factor\ + * target_qvalues + + with autograd.record(): + qvalues1 = self._critic(states, actions) + critic1_loss = l2_loss(qvalues1, y) + critic1_loss.backward() + trainer_critic.step(self._minibatch_size) + + with autograd.record(): + qvalues2 = self._critic2(states, actions) + critic2_loss = l2_loss(qvalues2, y) + critic2_loss.backward() + trainer_critic2.step(self._minibatch_size) + + critic_loss = (critic1_loss.mean() + critic2_loss.mean())/2 + + if self._total_steps % self._policy_delay == 0: + tmp_critic = self._copy_critic() + with autograd.record(): + actor_loss = -tmp_critic( + states, self._actor(states)).mean() + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + self._critic2_target = self._soft_update( + self._critic2, self._critic2_target, + self._soft_target_update_rate) + + actor_updates = actor_updates + 1 + else: + actor_loss = nd.array([0], ctx=self._ctx) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + 
episode_actor_loss += 0 if actor_updates == 0 else\ + np.sum(actor_loss.asnumpy()[0]) + episode_avg_q_value +=\ + np.sum(target_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if actor_updates == 0\ + else (episode_actor_loss / actor_updates) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if actor_updates == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.export_best_network() + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(TwinDelayedDdpgAgent, self)._make_config_dict() + config['policy_noise'] = self._policy_noise + config['noise_clip'] = self._noise_clip + config['policy_delay'] = self._policy_delay + return config + + class DqnAgent(Agent): def __init__( self, @@ -652,7 +1004,7 @@ class DqnAgent(Agent): action_dim, ctx=None, discount_factor=.9, - loss_function='euclidean', + loss_function='l2', optimizer='rmsprop', optimizer_params={'learning_rate': 0.09}, training_episodes=50, @@ -691,6 +1043,14 @@ class DqnAgent(Agent): self._double_dqn = double_dqn self._use_fix_target = use_fix_target + # Build memory buffer for discrete actions + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = (1,) + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + # Initialize best network self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) self._best_avg_score = -np.infty @@ -740,10 +1100,10 @@ class DqnAgent(Agent): def _make_pickle_ready(self, session_dir): super(DqnAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._qnet, 'current_qnet') - self._save_net(self._qnet, 'qnet', session_dir) + self._export_net(self._qnet, 'current_qnet') + self._export_net(self._qnet, 'qnet', filedir=session_dir) self._qnet = None - self._save_net(self._target_qnet, 'target_net', session_dir) + self._export_net(self._target_qnet, 'target_net', filedir=session_dir) self._target_qnet = None def get_q_values(self, state, with_best=False): @@ -839,6 +1199,7 @@ class DqnAgent(Agent): # 1. Choose an action based on current game state and policy q_values = self._qnet(nd.array([state], ctx=self._ctx)) action = self._strategy.select_action(q_values[0]) + self._strategy.decay(self._current_episode) # 2. Play the game for a single step next_state, reward, terminal, _ =\ @@ -846,7 +1207,7 @@ class DqnAgent(Agent): # 3. Store transition in replay memory self._memory.append( - state, action, reward, next_state, terminal) + state, [action], reward, next_state, terminal) # 4. 
Train the network if in interval if self._do_training(): @@ -875,8 +1236,6 @@ class DqnAgent(Agent): self._current_episode, start, training_steps, episode_loss, self._strategy.cur_eps, episode_reward) - self._strategy.decay(self._current_episode) - if self._is_target_reached(avg_reward): self._logger.info( 'Target score is reached in average; Training is stopped') @@ -886,7 +1245,7 @@ class DqnAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -902,7 +1261,8 @@ class DqnAgent(Agent): return config def save_parameters(self, episode): - self._save_parameters(self._qnet, episode=episode) + self._export_net( + self._qnet, self._agent_name + '_qnet', episode=episode) def _save_current_as_best_net(self): self._best_net = copy_net( diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/environment.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/environment.py index 0abebd14e8baa9d53c6d27fe8aa6c0b3accde212..1cdbcf6c69e2b86103166f1d55420c60381a3c47 100644 --- a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/environment.py +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/environment.py @@ -72,7 +72,6 @@ class RosEnvironment(Environment): def reset(self): self.__in_reset = True - time.sleep(0.5) reset_message = Bool() reset_message.data = True self.__waiting_for_state_update = True @@ -99,7 +98,8 @@ class RosEnvironment(Environment): next_state = self.__last_received_state terminal = self.__last_received_terminal reward = self.__last_received_reward - rospy.logdebug('Calculated reward: {}'.format(reward)) + + logger.debug('Transition: ({}, {}, {}, {})'.format(action, reward, next_state, terminal)) return next_state, reward, terminal, 0 @@ -118,23 +118,22 @@ class RosEnvironment(Environment): else: rospy.logerr("Timeout 3 times in a row: Terminate application") exit() - time.sleep(100/1000) + time.sleep(1/500) def close(self): rospy.signal_shutdown('Program ended!') def __state_callback(self, data): self.__last_received_state = np.array(data.data, dtype='float32').reshape((8,)) - rospy.logdebug('Received state: {}'.format(self.__last_received_state)) + logger.debug('Received state: {}'.format(self.__last_received_state)) self.__waiting_for_state_update = False def __terminal_state_callback(self, data): - self.__last_received_terminal = data.data - rospy.logdebug('Received terminal flag: {}'.format(self.__last_received_terminal)) - logger.debug('Received terminal: {}'.format(self.__last_received_terminal)) + self.__last_received_terminal = np.bool(data.data) + logger.debug('Received terminal flag: {}'.format(self.__last_received_terminal)) self.__waiting_for_terminal_update = False def __reward_callback(self, data): - self.__last_received_reward = float(data.data) + self.__last_received_reward = np.float32(data.data) logger.debug('Received reward: {}'.format(self.__last_received_reward)) self.__waiting_for_reward_update = False diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/strategy.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/strategy.py index 8e4c279e5569d5b1505ee4fb9f6e9136a40c30bb..d2d2386658c56c5ebb6d654468669a2664622305 100644 --- 
a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/strategy.py +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/strategy.py @@ -13,18 +13,21 @@ class StrategyBuilder(object): epsilon_decay_method='no', epsilon_decay=0.0, epsilon_decay_start=0, + epsilon_decay_per_step=False, action_dim=None, action_low=None, action_high=None, mu=0.0, theta=0.5, - sigma=0.3 + sigma=0.3, + noise_variance=0.1 ): if epsilon_decay_method == 'linear': decay = LinearDecay( eps_decay=epsilon_decay, min_eps=min_epsilon, - decay_start=epsilon_decay_start) + decay_start=epsilon_decay_start, + decay_per_step=epsilon_decay_per_step) else: decay = NoDecay() @@ -44,6 +47,13 @@ class StrategyBuilder(object): return OrnsteinUhlenbeckStrategy( action_dim, action_low, action_high, epsilon, mu, theta, sigma, decay) + elif method == 'gaussian': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert noise_variance is not None + return GaussianNoiseStrategy(action_dim, action_low, action_high, + epsilon, noise_variance, decay) else: assert action_dim is not None assert len(action_dim) == 1 @@ -70,17 +80,27 @@ class NoDecay(BaseDecay): class LinearDecay(BaseDecay): - def __init__(self, eps_decay, min_eps=0, decay_start=0): + def __init__(self, eps_decay, min_eps=0, decay_start=0, decay_per_step=False): super(LinearDecay, self).__init__() self.eps_decay = eps_decay self.min_eps = min_eps self.decay_start = decay_start + self.decay_per_step = decay_per_step + self.last_episode = -1 - def decay(self, cur_eps, episode): - if episode < self.decay_start: - return cur_eps + def do_decay(self, episode): + if self.decay_per_step: + do = (episode >= self.decay_start) else: + do = ((self.last_episode != episode) and (episode >= self.decay_start)) + self.last_episode = episode + return do + + def decay(self, cur_eps, episode): + if self.do_decay(episode): return max(cur_eps - self.eps_decay, self.min_eps) + else: + return cur_eps class BaseStrategy(object): @@ -170,3 +190,29 @@ class OrnsteinUhlenbeckStrategy(BaseStrategy): noise = self._evolve_state() action = (1.0 - self.cur_eps) * values + (self.cur_eps * noise) return np.clip(action, self._action_low, self._action_high) + + +class GaussianNoiseStrategy(BaseStrategy): + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + noise_variance, + decay=NoDecay() + ): + super(GaussianNoiseStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._noise_variance = noise_variance + + def select_action(self, values): + noise = np.random.normal(loc=0.0, scale=self._noise_variance, size=self._action_dim) + action = values + self.cur_eps * noise + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/util.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/util.py index 78e7795fd290f99980c689ad1c76f274a821f830..1d8b38c4a32a1cc6c0e364054e746e9115d7d05b 100644 --- a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/util.py +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/util.py @@ -11,8 +11,8 @@ import cnnarch_logger LOSS_FUNCTIONS = { 'l1': gluon.loss.L1Loss(), - 'euclidean': gluon.loss.L2Loss(), - 'huber_loss': gluon.loss.HuberLoss(), + 'l2': gluon.loss.L2Loss(), + 'huber': 
gluon.loss.HuberLoss(), 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} diff --git a/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py b/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py index 3459810d188cee5194a02b5c1053f90043cfd065..b09af8edee0582e4e5e19ae80621e0bbec87cd4c 100644 --- a/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py +++ b/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py @@ -88,8 +88,8 @@ if __name__ == "__main__": 'max_episode_step': 250, 'evaluation_samples': 100, 'target_score': 185.5, - 'actor': actor_creator.net, - 'critic': critic_creator.net, + 'actor': actor_creator.networks[0], + 'critic': critic_creator.networks[0], 'soft_target_update_rate': 0.001, 'actor_optimizer': 'adam', 'actor_optimizer_params': { @@ -133,4 +133,4 @@ if __name__ == "__main__": train_successful = agent.train() if train_successful: - agent.save_best_network(actor_creator._model_dir_ + actor_creator._model_prefix_ + '_0_newest', epoch=0) + agent.export_best_network(path=actor_creator._model_dir_ + actor_creator._model_prefix_ + '_0_newest', epoch=0) diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py b/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py index 6dbf9ef17065c25832844649b1c3bb16009f4e6f..148c89848b8d990b5f635c3976fd15e71a608893 100644 --- a/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py @@ -124,10 +124,10 @@ class Agent(object): def _make_pickle_ready(self, session_dir): del self._training_stats.logger - self._logger = None self._environment.close() self._environment = None - self._save_net(self._best_net, 'best_net', session_dir) + self._export_net(self._best_net, 'best_net', filedir=session_dir) + self._logger = None self._best_net = None def _make_config_dict(self): @@ -258,26 +258,22 @@ class Agent(object): return self._target_score is not None\ and avg_reward > self._target_score - def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + def _export_net(self, net, filename, filedir=None, episode=None): assert self._output_directory assert isinstance(net, gluon.HybridBlock) make_directory_if_not_exist(self._output_directory) + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename) - if(episode is not None): - self._logger.info( - 'Saving model parameters after episode %d' % episode) + if episode is not None: filename = filename + '-ep{}'.format(episode) - else: - self._logger.info('Saving model parameters') - self._save_net(net, filename) - def _save_net(self, net, filename, filedir=None): - filedir = self._output_directory if filedir is None else filedir - filename = os.path.join(filedir, filename) - net.save_parameters(filename + '.params') net.export(filename, epoch=0) + net.save_parameters(filename + '.params') - def save_best_network(self, path, epoch=0): + def export_best_network(self, path=None, epoch=0): + path = os.path.join(self._output_directory, 'best_network')\ + if path is None else path self._logger.info( 'Saving best network with average reward of {}'.format( self._best_avg_score)) @@ -373,15 +369,17 @@ class DdpgAgent(Agent): def _make_pickle_ready(self, session_dir): super(DdpgAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._actor, 'current_actor') + self._export_net(self._actor, 
'current_actor') - self._save_net(self._actor, 'actor', session_dir) + self._export_net(self._actor, 'actor', filedir=session_dir) self._actor = None - self._save_net(self._critic, 'critic', session_dir) + self._export_net(self._critic, 'critic', filedir=session_dir) self._critic = None - self._save_net(self._actor_target, 'actor_target', session_dir) + self._export_net( + self._actor_target, 'actor_target', filedir=session_dir) self._actor_target = None - self._save_net(self._critic_target, 'critic_target', session_dir) + self._export_net( + self._critic_target, 'critic_target', filedir=session_dir) self._critic_target = None @classmethod @@ -449,7 +447,8 @@ class DdpgAgent(Agent): return action[0].asnumpy() def save_parameters(self, episode): - self._save_parameters(self._actor, episode=episode) + self._export_net( + self._actor, self._agent_name + '_actor', episode=episode) def train(self, episodes=None): self.save_config_file() @@ -504,6 +503,7 @@ class DdpgAgent(Agent): # actor and exploration noise N according to strategy action = self._strategy.select_action( self.get_next_action(state)) + self._strategy.decay(self._current_episode) # Execute action a and observe reward r and next state ns next_state, reward, terminal, _ = \ @@ -545,10 +545,10 @@ class DdpgAgent(Agent): # with critic parameters tmp_critic = self._copy_critic() with autograd.record(): - actor_qvalues = tmp_critic(states, self._actor(states)) # For maximizing qvalues we have to multiply with -1 # as we use a minimizer - actor_loss = -1 * actor_qvalues.mean() + actor_loss = -tmp_critic( + states, self._actor(states)).mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -566,7 +566,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(actor_qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues.asnumpy()) / self._minibatch_size training_steps += 1 @@ -594,7 +594,6 @@ class DdpgAgent(Agent): self._strategy.cur_eps, episode_reward) self._do_snapshot_if_in_interval(self._current_episode) - self._strategy.decay(self._current_episode) if self._is_target_reached(avg_reward): self._logger.info( @@ -605,7 +604,7 @@ class DdpgAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -641,6 +640,359 @@ class DdpgAgent(Agent): self._critic, self._state_dim, self._action_dim, ctx=self._ctx) +class TwinDelayedDdpgAgent(DdpgAgent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None, + policy_noise=0.2, + noise_clip=0.5, + policy_delay=2 + ): + super(TwinDelayedDdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, 
discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples, + critic=critic, soft_target_update_rate=soft_target_update_rate, + actor=actor, actor_optimizer=actor_optimizer, + actor_optimizer_params=actor_optimizer_params, + critic_optimizer=critic_optimizer, + critic_optimizer_params=critic_optimizer_params) + + self._policy_noise = policy_noise + self._noise_clip = noise_clip + self._policy_delay = policy_delay + + self._critic2 = self._critic.__class__() + self._critic2.collect_params().initialize( + mx.init.Normal(), ctx=self._ctx) + self._critic2.hybridize() + self._critic2(nd.ones((1,) + state_dim, ctx=self._ctx), + nd.ones((1,) + action_dim, ctx=self._ctx)) + + self._critic2_target = self._copy_critic2() + + self._critic2_optimizer = critic_optimizer + self._critic2_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + def _make_pickle_ready(self, session_dir): + super(TwinDelayedDdpgAgent, self)._make_pickle_ready(session_dir) + self._export_net(self._critic2, 'critic2', filedir=session_dir) + self._critic2 = None + self._export_net( + self._critic2_target, 'critic2_target', filedir=session_dir) + self._critic2_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + files['critic2_net_params'] = os.path.join( + session_dir, 'critic2.params') + files['critic2_target_net_params'] = os.path.join( + session_dir, 'critic2_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + 
agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._critic2 = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2.load_parameters(files['critic2_net_params'], agent._ctx) + agent._critic2.hybridize() + agent._critic2( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic2_target = copy_net_with_two_inputs( + agent._critic2, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2_target.load_parameters( + files['critic2_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _copy_critic2(self): + assert self._critic2 is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic2, self._state_dim, self._action_dim, ctx=self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start TwinDelayedDDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q1' and Q2' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + self._critic2_target = self._copy_critic2() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + trainer_critic2 = gluon.Trainer( + self._critic2.collect_params(), self._critic2_optimizer, + self._critic2_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + actor_updates = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + self._strategy.decay(self._current_episode) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + clipped_noise = nd.array( + np.clip( + np.random.normal( + loc=0, 
scale=self._policy_noise, + size=self._minibatch_size + ).reshape(self._minibatch_size, 1), + -self._noise_clip, + self._noise_clip + ), + ctx=self._ctx + ) + target_action = np.clip( + self._actor_target(next_states) + clipped_noise, + self._strategy._action_low, + self._strategy._action_high) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + target_qvalues1 = self._critic_target(next_states, + target_action) + target_qvalues2 = self._critic2_target(next_states, + target_action) + target_qvalues = nd.minimum(target_qvalues1, + target_qvalues2) + y = rewards + (1 - terminals) * self._discount_factor\ + * target_qvalues + + with autograd.record(): + qvalues1 = self._critic(states, actions) + critic1_loss = l2_loss(qvalues1, y) + critic1_loss.backward() + trainer_critic.step(self._minibatch_size) + + with autograd.record(): + qvalues2 = self._critic2(states, actions) + critic2_loss = l2_loss(qvalues2, y) + critic2_loss.backward() + trainer_critic2.step(self._minibatch_size) + + critic_loss = (critic1_loss.mean() + critic2_loss.mean())/2 + + if self._total_steps % self._policy_delay == 0: + tmp_critic = self._copy_critic() + with autograd.record(): + actor_loss = -tmp_critic( + states, self._actor(states)).mean() + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + self._critic2_target = self._soft_update( + self._critic2, self._critic2_target, + self._soft_target_update_rate) + + actor_updates = actor_updates + 1 + else: + actor_loss = nd.array([0], ctx=self._ctx) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss += 0 if actor_updates == 0 else\ + np.sum(actor_loss.asnumpy()[0]) + episode_avg_q_value +=\ + np.sum(target_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if actor_updates == 0\ + else (episode_actor_loss / actor_updates) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if actor_updates == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.export_best_network() + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(TwinDelayedDdpgAgent, self)._make_config_dict() + config['policy_noise'] = self._policy_noise + config['noise_clip'] = self._noise_clip + config['policy_delay'] = self._policy_delay + return config + + class DqnAgent(Agent): def __init__( 
self, @@ -652,7 +1004,7 @@ class DqnAgent(Agent): action_dim, ctx=None, discount_factor=.9, - loss_function='euclidean', + loss_function='l2', optimizer='rmsprop', optimizer_params={'learning_rate': 0.09}, training_episodes=50, @@ -691,6 +1043,14 @@ class DqnAgent(Agent): self._double_dqn = double_dqn self._use_fix_target = use_fix_target + # Build memory buffer for discrete actions + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = (1,) + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + # Initialize best network self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) self._best_avg_score = -np.infty @@ -740,10 +1100,10 @@ class DqnAgent(Agent): def _make_pickle_ready(self, session_dir): super(DqnAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._qnet, 'current_qnet') - self._save_net(self._qnet, 'qnet', session_dir) + self._export_net(self._qnet, 'current_qnet') + self._export_net(self._qnet, 'qnet', filedir=session_dir) self._qnet = None - self._save_net(self._target_qnet, 'target_net', session_dir) + self._export_net(self._target_qnet, 'target_net', filedir=session_dir) self._target_qnet = None def get_q_values(self, state, with_best=False): @@ -839,6 +1199,7 @@ class DqnAgent(Agent): # 1. Choose an action based on current game state and policy q_values = self._qnet(nd.array([state], ctx=self._ctx)) action = self._strategy.select_action(q_values[0]) + self._strategy.decay(self._current_episode) # 2. Play the game for a single step next_state, reward, terminal, _ =\ @@ -846,7 +1207,7 @@ class DqnAgent(Agent): # 3. Store transition in replay memory self._memory.append( - state, action, reward, next_state, terminal) + state, [action], reward, next_state, terminal) # 4. 
Train the network if in interval if self._do_training(): @@ -875,8 +1236,6 @@ class DqnAgent(Agent): self._current_episode, start, training_steps, episode_loss, self._strategy.cur_eps, episode_reward) - self._strategy.decay(self._current_episode) - if self._is_target_reached(avg_reward): self._logger.info( 'Target score is reached in average; Training is stopped') @@ -886,7 +1245,7 @@ class DqnAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -902,7 +1261,8 @@ class DqnAgent(Agent): return config def save_parameters(self, episode): - self._save_parameters(self._qnet, episode=episode) + self._export_net( + self._qnet, self._agent_name + '_qnet', episode=episode) def _save_current_as_best_net(self): self._best_net = copy_net( diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/strategy.py b/src/test/resources/target_code/ddpg/reinforcement_learning/strategy.py index 8e4c279e5569d5b1505ee4fb9f6e9136a40c30bb..d2d2386658c56c5ebb6d654468669a2664622305 100644 --- a/src/test/resources/target_code/ddpg/reinforcement_learning/strategy.py +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/strategy.py @@ -13,18 +13,21 @@ class StrategyBuilder(object): epsilon_decay_method='no', epsilon_decay=0.0, epsilon_decay_start=0, + epsilon_decay_per_step=False, action_dim=None, action_low=None, action_high=None, mu=0.0, theta=0.5, - sigma=0.3 + sigma=0.3, + noise_variance=0.1 ): if epsilon_decay_method == 'linear': decay = LinearDecay( eps_decay=epsilon_decay, min_eps=min_epsilon, - decay_start=epsilon_decay_start) + decay_start=epsilon_decay_start, + decay_per_step=epsilon_decay_per_step) else: decay = NoDecay() @@ -44,6 +47,13 @@ class StrategyBuilder(object): return OrnsteinUhlenbeckStrategy( action_dim, action_low, action_high, epsilon, mu, theta, sigma, decay) + elif method == 'gaussian': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert noise_variance is not None + return GaussianNoiseStrategy(action_dim, action_low, action_high, + epsilon, noise_variance, decay) else: assert action_dim is not None assert len(action_dim) == 1 @@ -70,17 +80,27 @@ class NoDecay(BaseDecay): class LinearDecay(BaseDecay): - def __init__(self, eps_decay, min_eps=0, decay_start=0): + def __init__(self, eps_decay, min_eps=0, decay_start=0, decay_per_step=False): super(LinearDecay, self).__init__() self.eps_decay = eps_decay self.min_eps = min_eps self.decay_start = decay_start + self.decay_per_step = decay_per_step + self.last_episode = -1 - def decay(self, cur_eps, episode): - if episode < self.decay_start: - return cur_eps + def do_decay(self, episode): + if self.decay_per_step: + do = (episode >= self.decay_start) else: + do = ((self.last_episode != episode) and (episode >= self.decay_start)) + self.last_episode = episode + return do + + def decay(self, cur_eps, episode): + if self.do_decay(episode): return max(cur_eps - self.eps_decay, self.min_eps) + else: + return cur_eps class BaseStrategy(object): @@ -170,3 +190,29 @@ class OrnsteinUhlenbeckStrategy(BaseStrategy): noise = self._evolve_state() action = (1.0 - self.cur_eps) * values + (self.cur_eps * noise) return np.clip(action, self._action_low, self._action_high) + + +class GaussianNoiseStrategy(BaseStrategy): + def __init__( 
+ self, + action_dim, + action_low, + action_high, + eps, + noise_variance, + decay=NoDecay() + ): + super(GaussianNoiseStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._noise_variance = noise_variance + + def select_action(self, values): + noise = np.random.normal(loc=0.0, scale=self._noise_variance, size=self._action_dim) + action = values + self.cur_eps * noise + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/util.py b/src/test/resources/target_code/ddpg/reinforcement_learning/util.py index 78e7795fd290f99980c689ad1c76f274a821f830..1d8b38c4a32a1cc6c0e364054e746e9115d7d05b 100644 --- a/src/test/resources/target_code/ddpg/reinforcement_learning/util.py +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/util.py @@ -11,8 +11,8 @@ import cnnarch_logger LOSS_FUNCTIONS = { 'l1': gluon.loss.L1Loss(), - 'euclidean': gluon.loss.L2Loss(), - 'huber_loss': gluon.loss.HuberLoss(), + 'l2': gluon.loss.L2Loss(), + 'huber': gluon.loss.HuberLoss(), 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} diff --git a/src/test/resources/target_code/reinforcement_learning/agent.py b/src/test/resources/target_code/reinforcement_learning/agent.py index a93e743dde9383d9883ea69e6cd6b4d350f46249..148c89848b8d990b5f635c3976fd15e71a608893 100644 --- a/src/test/resources/target_code/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/reinforcement_learning/agent.py @@ -124,10 +124,10 @@ class Agent(object): def _make_pickle_ready(self, session_dir): del self._training_stats.logger - self._logger = None self._environment.close() self._environment = None - self._save_net(self._best_net, 'best_net', session_dir) + self._export_net(self._best_net, 'best_net', filedir=session_dir) + self._logger = None self._best_net = None def _make_config_dict(self): @@ -258,26 +258,22 @@ class Agent(object): return self._target_score is not None\ and avg_reward > self._target_score - def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + def _export_net(self, net, filename, filedir=None, episode=None): assert self._output_directory assert isinstance(net, gluon.HybridBlock) make_directory_if_not_exist(self._output_directory) + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename) - if(episode is not None): - self._logger.info( - 'Saving model parameters after episode %d' % episode) + if episode is not None: filename = filename + '-ep{}'.format(episode) - else: - self._logger.info('Saving model parameters') - self._save_net(net, filename) - def _save_net(self, net, filename, filedir=None): - filedir = self._output_directory if filedir is None else filedir - filename = os.path.join(filedir, filename) - net.save_parameters(filename + '.params') net.export(filename, epoch=0) + net.save_parameters(filename + '.params') - def save_best_network(self, path, epoch=0): + def export_best_network(self, path=None, epoch=0): + path = os.path.join(self._output_directory, 'best_network')\ + if path is None else path self._logger.info( 'Saving best network with average reward of {}'.format( self._best_avg_score)) @@ -373,15 +369,17 @@ class DdpgAgent(Agent): def _make_pickle_ready(self, session_dir): super(DdpgAgent, self)._make_pickle_ready(session_dir) - 
self._save_net(self._actor, 'current_actor') + self._export_net(self._actor, 'current_actor') - self._save_net(self._actor, 'actor', session_dir) + self._export_net(self._actor, 'actor', filedir=session_dir) self._actor = None - self._save_net(self._critic, 'critic', session_dir) + self._export_net(self._critic, 'critic', filedir=session_dir) self._critic = None - self._save_net(self._actor_target, 'actor_target', session_dir) + self._export_net( + self._actor_target, 'actor_target', filedir=session_dir) self._actor_target = None - self._save_net(self._critic_target, 'critic_target', session_dir) + self._export_net( + self._critic_target, 'critic_target', filedir=session_dir) self._critic_target = None @classmethod @@ -449,7 +447,8 @@ class DdpgAgent(Agent): return action[0].asnumpy() def save_parameters(self, episode): - self._save_parameters(self._actor, episode=episode) + self._export_net( + self._actor, self._agent_name + '_actor', episode=episode) def train(self, episodes=None): self.save_config_file() @@ -504,6 +503,7 @@ class DdpgAgent(Agent): # actor and exploration noise N according to strategy action = self._strategy.select_action( self.get_next_action(state)) + self._strategy.decay(self._current_episode) # Execute action a and observe reward r and next state ns next_state, reward, terminal, _ = \ @@ -545,10 +545,10 @@ class DdpgAgent(Agent): # with critic parameters tmp_critic = self._copy_critic() with autograd.record(): - actor_qvalues = tmp_critic(states, self._actor(states)) # For maximizing qvalues we have to multiply with -1 # as we use a minimizer - actor_loss = -1 * actor_qvalues + actor_loss = -tmp_critic( + states, self._actor(states)).mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -566,7 +566,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(actor_qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues.asnumpy()) / self._minibatch_size training_steps += 1 @@ -594,7 +594,6 @@ class DdpgAgent(Agent): self._strategy.cur_eps, episode_reward) self._do_snapshot_if_in_interval(self._current_episode) - self._strategy.decay(self._current_episode) if self._is_target_reached(avg_reward): self._logger.info( @@ -605,7 +604,7 @@ class DdpgAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -641,6 +640,359 @@ class DdpgAgent(Agent): self._critic, self._state_dim, self._action_dim, ctx=self._ctx) +class TwinDelayedDdpgAgent(DdpgAgent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None, + policy_noise=0.2, + noise_clip=0.5, + policy_delay=2 + ): + super(TwinDelayedDdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + 
strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples, + critic=critic, soft_target_update_rate=soft_target_update_rate, + actor=actor, actor_optimizer=actor_optimizer, + actor_optimizer_params=actor_optimizer_params, + critic_optimizer=critic_optimizer, + critic_optimizer_params=critic_optimizer_params) + + self._policy_noise = policy_noise + self._noise_clip = noise_clip + self._policy_delay = policy_delay + + self._critic2 = self._critic.__class__() + self._critic2.collect_params().initialize( + mx.init.Normal(), ctx=self._ctx) + self._critic2.hybridize() + self._critic2(nd.ones((1,) + state_dim, ctx=self._ctx), + nd.ones((1,) + action_dim, ctx=self._ctx)) + + self._critic2_target = self._copy_critic2() + + self._critic2_optimizer = critic_optimizer + self._critic2_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + def _make_pickle_ready(self, session_dir): + super(TwinDelayedDdpgAgent, self)._make_pickle_ready(session_dir) + self._export_net(self._critic2, 'critic2', filedir=session_dir) + self._critic2 = None + self._export_net( + self._critic2_target, 'critic2_target', filedir=session_dir) + self._critic2_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + files['critic2_net_params'] = os.path.join( + session_dir, 'critic2.params') + files['critic2_target_net_params'] = os.path.join( + session_dir, 'critic2_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, 
agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._critic2 = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2.load_parameters(files['critic2_net_params'], agent._ctx) + agent._critic2.hybridize() + agent._critic2( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic2_target = copy_net_with_two_inputs( + agent._critic2, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2_target.load_parameters( + files['critic2_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _copy_critic2(self): + assert self._critic2 is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic2, self._state_dim, self._action_dim, ctx=self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start TwinDelayedDDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q1' and Q2' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + self._critic2_target = self._copy_critic2() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + trainer_critic2 = gluon.Trainer( + self._critic2.collect_params(), self._critic2_optimizer, + self._critic2_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + actor_updates = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + self._strategy.decay(self._current_episode) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + clipped_noise = nd.array( + 
np.clip( + np.random.normal( + loc=0, scale=self._policy_noise, + size=self._minibatch_size + ).reshape(self._minibatch_size, 1), + -self._noise_clip, + self._noise_clip + ), + ctx=self._ctx + ) + target_action = np.clip( + self._actor_target(next_states) + clipped_noise, + self._strategy._action_low, + self._strategy._action_high) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + target_qvalues1 = self._critic_target(next_states, + target_action) + target_qvalues2 = self._critic2_target(next_states, + target_action) + target_qvalues = nd.minimum(target_qvalues1, + target_qvalues2) + y = rewards + (1 - terminals) * self._discount_factor\ + * target_qvalues + + with autograd.record(): + qvalues1 = self._critic(states, actions) + critic1_loss = l2_loss(qvalues1, y) + critic1_loss.backward() + trainer_critic.step(self._minibatch_size) + + with autograd.record(): + qvalues2 = self._critic2(states, actions) + critic2_loss = l2_loss(qvalues2, y) + critic2_loss.backward() + trainer_critic2.step(self._minibatch_size) + + critic_loss = (critic1_loss.mean() + critic2_loss.mean())/2 + + if self._total_steps % self._policy_delay == 0: + tmp_critic = self._copy_critic() + with autograd.record(): + actor_loss = -tmp_critic( + states, self._actor(states)).mean() + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + self._critic2_target = self._soft_update( + self._critic2, self._critic2_target, + self._soft_target_update_rate) + + actor_updates = actor_updates + 1 + else: + actor_loss = nd.array([0], ctx=self._ctx) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss += 0 if actor_updates == 0 else\ + np.sum(actor_loss.asnumpy()[0]) + episode_avg_q_value +=\ + np.sum(target_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if actor_updates == 0\ + else (episode_actor_loss / actor_updates) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if actor_updates == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.export_best_network() + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(TwinDelayedDdpgAgent, self)._make_config_dict() + config['policy_noise'] = self._policy_noise + config['noise_clip'] = self._noise_clip + config['policy_delay'] = self._policy_delay + return config + + 
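For orientation, the generated trainer CNNTrainer_tD3Config.py further down in this diff wires the new TwinDelayedDdpgAgent together roughly as follows. This is a condensed, illustrative sketch only: the environment, network creators, and hyperparameter values are copied from that generated trainer (CartPole-v1, CNNCreator_tD3Config, CNNCreator_CriticNetwork) and are not part of agent.py itself.

# Illustrative sketch, condensed from the generated CNNTrainer_tD3Config.py below.
import mxnet as mx
import CNNCreator_tD3Config
import reinforcement_learning.environment
from reinforcement_learning.agent import TwinDelayedDdpgAgent
from reinforcement_learning.CNNCreator_CriticNetwork import CNNCreator_CriticNetwork

context = mx.cpu()
actor_creator = CNNCreator_tD3Config.CNNCreator_tD3Config()
actor_creator.construct(context)
critic_creator = CNNCreator_CriticNetwork()
critic_creator.construct(context)

agent = TwinDelayedDdpgAgent(
    environment=reinforcement_learning.environment.GymEnvironment('CartPole-v1'),
    replay_memory_params={'method': 'online', 'state_dtype': 'float32',
                          'action_dtype': 'float32', 'rewards_dtype': 'float32'},
    strategy_params={'method': 'gaussian', 'epsilon': 1, 'min_epsilon': 0.001,
                     'epsilon_decay_method': 'linear', 'epsilon_decay': 0.0001,
                     'epsilon_decay_start': 50, 'epsilon_decay_per_step': True,
                     'noise_variance': 0.3, 'action_low': -1, 'action_high': 1},
    state_dim=(8,), action_dim=(3,),
    actor=actor_creator.networks[0],
    critic=critic_creator.networks[0],
    # TD3-specific knobs: target-policy smoothing noise, its clip range,
    # and how many critic updates happen per delayed actor/target update.
    policy_noise=0.1, noise_clip=0.8, policy_delay=4,
    ctx=context)

if agent.train():
    agent.export_best_network()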
class DqnAgent(Agent): def __init__( self, @@ -652,7 +1004,7 @@ class DqnAgent(Agent): action_dim, ctx=None, discount_factor=.9, - loss_function='euclidean', + loss_function='l2', optimizer='rmsprop', optimizer_params={'learning_rate': 0.09}, training_episodes=50, @@ -691,6 +1043,14 @@ class DqnAgent(Agent): self._double_dqn = double_dqn self._use_fix_target = use_fix_target + # Build memory buffer for discrete actions + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = (1,) + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + # Initialize best network self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) self._best_avg_score = -np.infty @@ -740,10 +1100,10 @@ class DqnAgent(Agent): def _make_pickle_ready(self, session_dir): super(DqnAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._qnet, 'current_qnet') - self._save_net(self._qnet, 'qnet', session_dir) + self._export_net(self._qnet, 'current_qnet') + self._export_net(self._qnet, 'qnet', filedir=session_dir) self._qnet = None - self._save_net(self._target_qnet, 'target_net', session_dir) + self._export_net(self._target_qnet, 'target_net', filedir=session_dir) self._target_qnet = None def get_q_values(self, state, with_best=False): @@ -839,6 +1199,7 @@ class DqnAgent(Agent): # 1. Choose an action based on current game state and policy q_values = self._qnet(nd.array([state], ctx=self._ctx)) action = self._strategy.select_action(q_values[0]) + self._strategy.decay(self._current_episode) # 2. Play the game for a single step next_state, reward, terminal, _ =\ @@ -846,7 +1207,7 @@ class DqnAgent(Agent): # 3. Store transition in replay memory self._memory.append( - state, action, reward, next_state, terminal) + state, [action], reward, next_state, terminal) # 4. 
Train the network if in interval if self._do_training(): @@ -875,8 +1236,6 @@ class DqnAgent(Agent): self._current_episode, start, training_steps, episode_loss, self._strategy.cur_eps, episode_reward) - self._strategy.decay(self._current_episode) - if self._is_target_reached(avg_reward): self._logger.info( 'Target score is reached in average; Training is stopped') @@ -886,7 +1245,7 @@ class DqnAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -902,7 +1261,8 @@ class DqnAgent(Agent): return config def save_parameters(self, episode): - self._save_parameters(self._qnet, episode=episode) + self._export_net( + self._qnet, self._agent_name + '_qnet', episode=episode) def _save_current_as_best_net(self): self._best_net = copy_net( diff --git a/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py b/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py index 1ca02083ea2cdf6d4027636b11ee8bce77e0e921..261623f4fe95398dc34984e6560596ded0fcc04a 100644 --- a/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py +++ b/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py @@ -49,6 +49,7 @@ if __name__ == "__main__": 'state_topic': '/environment/state', 'action_topic': '/environment/action', 'reset_topic': '/environment/reset', + 'reward_topic': '/environment/reward', } env = reinforcement_learning.environment.RosEnvironment(**env_params) @@ -93,8 +94,8 @@ if __name__ == "__main__": 'snapshot_interval': 500, 'max_episode_step': 2000, 'evaluation_samples': 1000, - 'actor': actor_creator.net, - 'critic': critic_creator.net, + 'actor': actor_creator.networks[0], + 'critic': critic_creator.networks[0], 'soft_target_update_rate': 0.001, 'actor_optimizer': 'adam', 'actor_optimizer_params': { @@ -138,4 +139,4 @@ if __name__ == "__main__": train_successful = agent.train() if train_successful: - agent.save_best_network(actor_creator._model_dir_ + actor_creator._model_prefix_ + '_0_newest', epoch=0) + agent.export_best_network(path=actor_creator._model_dir_ + actor_creator._model_prefix_ + '_0_newest', epoch=0) diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py index 6dbf9ef17065c25832844649b1c3bb16009f4e6f..148c89848b8d990b5f635c3976fd15e71a608893 100644 --- a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py @@ -124,10 +124,10 @@ class Agent(object): def _make_pickle_ready(self, session_dir): del self._training_stats.logger - self._logger = None self._environment.close() self._environment = None - self._save_net(self._best_net, 'best_net', session_dir) + self._export_net(self._best_net, 'best_net', filedir=session_dir) + self._logger = None self._best_net = None def _make_config_dict(self): @@ -258,26 +258,22 @@ class Agent(object): return self._target_score is not None\ and avg_reward > self._target_score - def _save_parameters(self, net, episode=None, filename='dqn-agent-params'): + def _export_net(self, net, filename, filedir=None, episode=None): assert self._output_directory assert isinstance(net, gluon.HybridBlock) make_directory_if_not_exist(self._output_directory) + filedir = 
self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename) - if(episode is not None): - self._logger.info( - 'Saving model parameters after episode %d' % episode) + if episode is not None: filename = filename + '-ep{}'.format(episode) - else: - self._logger.info('Saving model parameters') - self._save_net(net, filename) - def _save_net(self, net, filename, filedir=None): - filedir = self._output_directory if filedir is None else filedir - filename = os.path.join(filedir, filename) - net.save_parameters(filename + '.params') net.export(filename, epoch=0) + net.save_parameters(filename + '.params') - def save_best_network(self, path, epoch=0): + def export_best_network(self, path=None, epoch=0): + path = os.path.join(self._output_directory, 'best_network')\ + if path is None else path self._logger.info( 'Saving best network with average reward of {}'.format( self._best_avg_score)) @@ -373,15 +369,17 @@ class DdpgAgent(Agent): def _make_pickle_ready(self, session_dir): super(DdpgAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._actor, 'current_actor') + self._export_net(self._actor, 'current_actor') - self._save_net(self._actor, 'actor', session_dir) + self._export_net(self._actor, 'actor', filedir=session_dir) self._actor = None - self._save_net(self._critic, 'critic', session_dir) + self._export_net(self._critic, 'critic', filedir=session_dir) self._critic = None - self._save_net(self._actor_target, 'actor_target', session_dir) + self._export_net( + self._actor_target, 'actor_target', filedir=session_dir) self._actor_target = None - self._save_net(self._critic_target, 'critic_target', session_dir) + self._export_net( + self._critic_target, 'critic_target', filedir=session_dir) self._critic_target = None @classmethod @@ -449,7 +447,8 @@ class DdpgAgent(Agent): return action[0].asnumpy() def save_parameters(self, episode): - self._save_parameters(self._actor, episode=episode) + self._export_net( + self._actor, self._agent_name + '_actor', episode=episode) def train(self, episodes=None): self.save_config_file() @@ -504,6 +503,7 @@ class DdpgAgent(Agent): # actor and exploration noise N according to strategy action = self._strategy.select_action( self.get_next_action(state)) + self._strategy.decay(self._current_episode) # Execute action a and observe reward r and next state ns next_state, reward, terminal, _ = \ @@ -545,10 +545,10 @@ class DdpgAgent(Agent): # with critic parameters tmp_critic = self._copy_critic() with autograd.record(): - actor_qvalues = tmp_critic(states, self._actor(states)) # For maximizing qvalues we have to multiply with -1 # as we use a minimizer - actor_loss = -1 * actor_qvalues.mean() + actor_loss = -tmp_critic( + states, self._actor(states)).mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -566,7 +566,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(actor_qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues.asnumpy()) / self._minibatch_size training_steps += 1 @@ -594,7 +594,6 @@ class DdpgAgent(Agent): self._strategy.cur_eps, episode_reward) self._do_snapshot_if_in_interval(self._current_episode) - self._strategy.decay(self._current_episode) if self._is_target_reached(avg_reward): self._logger.info( @@ -605,7 +604,7 @@ class DdpgAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + 
self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -641,6 +640,359 @@ class DdpgAgent(Agent): self._critic, self._state_dim, self._action_dim, ctx=self._ctx) +class TwinDelayedDdpgAgent(DdpgAgent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None, + policy_noise=0.2, + noise_clip=0.5, + policy_delay=2 + ): + super(TwinDelayedDdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples, + critic=critic, soft_target_update_rate=soft_target_update_rate, + actor=actor, actor_optimizer=actor_optimizer, + actor_optimizer_params=actor_optimizer_params, + critic_optimizer=critic_optimizer, + critic_optimizer_params=critic_optimizer_params) + + self._policy_noise = policy_noise + self._noise_clip = noise_clip + self._policy_delay = policy_delay + + self._critic2 = self._critic.__class__() + self._critic2.collect_params().initialize( + mx.init.Normal(), ctx=self._ctx) + self._critic2.hybridize() + self._critic2(nd.ones((1,) + state_dim, ctx=self._ctx), + nd.ones((1,) + action_dim, ctx=self._ctx)) + + self._critic2_target = self._copy_critic2() + + self._critic2_optimizer = critic_optimizer + self._critic2_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + def _make_pickle_ready(self, session_dir): + super(TwinDelayedDdpgAgent, self)._make_pickle_ready(session_dir) + self._export_net(self._critic2, 'critic2', filedir=session_dir) + self._critic2 = None + self._export_net( + self._critic2_target, 'critic2_target', filedir=session_dir) + self._critic2_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + files['critic2_net_params'] = os.path.join( + session_dir, 'critic2.params') + files['critic2_target_net_params'] = os.path.join( + session_dir, 'critic2_target.params') + + for file in files.values(): + if not os.path.exists(file): + 
raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._critic2 = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2.load_parameters(files['critic2_net_params'], agent._ctx) + agent._critic2.hybridize() + agent._critic2( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic2_target = copy_net_with_two_inputs( + agent._critic2, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2_target.load_parameters( + files['critic2_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _copy_critic2(self): + assert self._critic2 is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic2, self._state_dim, self._action_dim, ctx=self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start TwinDelayedDDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q1' and Q2' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + self._critic2_target = self._copy_critic2() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + trainer_critic2 = gluon.Trainer( + self._critic2.collect_params(), self._critic2_optimizer, + self._critic2_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() 
+ episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + actor_updates = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + self._strategy.decay(self._current_episode) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + clipped_noise = nd.array( + np.clip( + np.random.normal( + loc=0, scale=self._policy_noise, + size=self._minibatch_size + ).reshape(self._minibatch_size, 1), + -self._noise_clip, + self._noise_clip + ), + ctx=self._ctx + ) + target_action = np.clip( + self._actor_target(next_states) + clipped_noise, + self._strategy._action_low, + self._strategy._action_high) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + target_qvalues1 = self._critic_target(next_states, + target_action) + target_qvalues2 = self._critic2_target(next_states, + target_action) + target_qvalues = nd.minimum(target_qvalues1, + target_qvalues2) + y = rewards + (1 - terminals) * self._discount_factor\ + * target_qvalues + + with autograd.record(): + qvalues1 = self._critic(states, actions) + critic1_loss = l2_loss(qvalues1, y) + critic1_loss.backward() + trainer_critic.step(self._minibatch_size) + + with autograd.record(): + qvalues2 = self._critic2(states, actions) + critic2_loss = l2_loss(qvalues2, y) + critic2_loss.backward() + trainer_critic2.step(self._minibatch_size) + + critic_loss = (critic1_loss.mean() + critic2_loss.mean())/2 + + if self._total_steps % self._policy_delay == 0: + tmp_critic = self._copy_critic() + with autograd.record(): + actor_loss = -tmp_critic( + states, self._actor(states)).mean() + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + self._critic2_target = self._soft_update( + self._critic2, self._critic2_target, + self._soft_target_update_rate) + + actor_updates = actor_updates + 1 + else: + actor_loss = nd.array([0], ctx=self._ctx) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss += 0 if actor_updates == 0 else\ + np.sum(actor_loss.asnumpy()[0]) + episode_avg_q_value +=\ + np.sum(target_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if actor_updates == 0\ + else (episode_actor_loss / actor_updates) + episode_critic_loss = 0 if training_steps == 
0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if actor_updates == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.export_best_network() + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(TwinDelayedDdpgAgent, self)._make_config_dict() + config['policy_noise'] = self._policy_noise + config['noise_clip'] = self._noise_clip + config['policy_delay'] = self._policy_delay + return config + + class DqnAgent(Agent): def __init__( self, @@ -652,7 +1004,7 @@ class DqnAgent(Agent): action_dim, ctx=None, discount_factor=.9, - loss_function='euclidean', + loss_function='l2', optimizer='rmsprop', optimizer_params={'learning_rate': 0.09}, training_episodes=50, @@ -691,6 +1043,14 @@ class DqnAgent(Agent): self._double_dqn = double_dqn self._use_fix_target = use_fix_target + # Build memory buffer for discrete actions + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = (1,) + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + # Initialize best network self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) self._best_avg_score = -np.infty @@ -740,10 +1100,10 @@ class DqnAgent(Agent): def _make_pickle_ready(self, session_dir): super(DqnAgent, self)._make_pickle_ready(session_dir) - self._save_net(self._qnet, 'current_qnet') - self._save_net(self._qnet, 'qnet', session_dir) + self._export_net(self._qnet, 'current_qnet') + self._export_net(self._qnet, 'qnet', filedir=session_dir) self._qnet = None - self._save_net(self._target_qnet, 'target_net', session_dir) + self._export_net(self._target_qnet, 'target_net', filedir=session_dir) self._target_qnet = None def get_q_values(self, state, with_best=False): @@ -839,6 +1199,7 @@ class DqnAgent(Agent): # 1. Choose an action based on current game state and policy q_values = self._qnet(nd.array([state], ctx=self._ctx)) action = self._strategy.select_action(q_values[0]) + self._strategy.decay(self._current_episode) # 2. Play the game for a single step next_state, reward, terminal, _ =\ @@ -846,7 +1207,7 @@ class DqnAgent(Agent): # 3. Store transition in replay memory self._memory.append( - state, action, reward, next_state, terminal) + state, [action], reward, next_state, terminal) # 4. 
Train the network if in interval if self._do_training(): @@ -875,8 +1236,6 @@ class DqnAgent(Agent): self._current_episode, start, training_steps, episode_loss, self._strategy.cur_eps, episode_reward) - self._strategy.decay(self._current_episode) - if self._is_target_reached(avg_reward): self._logger.info( 'Target score is reached in average; Training is stopped') @@ -886,7 +1245,7 @@ class DqnAgent(Agent): self._evaluate() self.save_parameters(episode=self._current_episode) - self.save_best_network(os.path.join(self._output_directory, 'best')) + self.export_best_network() self._training_stats.save_stats(self._output_directory) self._logger.info('--------- Training finished ---------') return True @@ -902,7 +1261,8 @@ class DqnAgent(Agent): return config def save_parameters(self, episode): - self._save_parameters(self._qnet, episode=episode) + self._export_net( + self._qnet, self._agent_name + '_qnet', episode=episode) def _save_current_as_best_net(self): self._best_net = copy_net( diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/environment.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/environment.py index 20cfa808169c31af36f93c5dace28cf1af116a91..aef81280dd430a1de63cbd7064e8190800495697 100644 --- a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/environment.py +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/environment.py @@ -2,29 +2,12 @@ import abc import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -import reward_rewardFunction_executor - -class RewardFunction(object): - def __init__(self): - self.__reward_wrapper = reward_rewardFunction_executor.reward_rewardFunction_executor() - self.__reward_wrapper.init() - - def reward(self, state, terminal): - s = state.astype('double') - t = bool(terminal) - inp = reward_rewardFunction_executor.reward_rewardFunction_input() - inp.state = s - inp.isTerminal = t - output = self.__reward_wrapper.execute(inp) - return output.reward - - class Environment: __metaclass__ = abc.ABCMeta def __init__(self): - self._reward_function = RewardFunction() + pass @abc.abstractmethod def reset(self): @@ -60,6 +43,8 @@ class RosEnvironment(Environment): self.__waiting_for_terminal_update = False self.__last_received_state = 0 self.__last_received_terminal = True + self.__last_received_reward = 0.0 + self.__waiting_for_reward_update = False rospy.loginfo("Initialize node {0}".format(ros_node_name)) @@ -77,6 +62,9 @@ class RosEnvironment(Environment): self.__terminal_state_subscriber = rospy.Subscriber(terminal_state_topic, Bool, self.__terminal_state_callback) rospy.loginfo('Terminal State Subscriber registered with topic {}'.format(terminal_state_topic)) + self.__reward_subscriber = rospy.Subscriber(reward_topic, Float32, self.__reward_callback) + rospy.loginfo('Reward Subscriber registered with topic {}'.format(reward_topic)) + rate = rospy.Rate(10) thread.start_new_thread(rospy.spin, ()) @@ -84,7 +72,6 @@ class RosEnvironment(Environment): def reset(self): self.__in_reset = True - time.sleep(0.5) reset_message = Bool() reset_message.data = True self.__waiting_for_state_update = True @@ -110,12 +97,14 @@ class RosEnvironment(Environment): self.__waiting_for_state_update = True self.__waiting_for_terminal_update = True + self.__waiting_for_reward_update = True self.__step_publisher.publish(action_rospy) self.__wait_for_new_state(self.__step_publisher, action_rospy) next_state = self.__last_received_state terminal = 
self.__last_received_terminal - reward = self.__calc_reward(next_state, terminal) - rospy.logdebug('Calculated reward: {}'.format(reward)) + reward = self.__last_received_reward + + logger.debug('Transition: ({}, {}, {}, {})'.format(action, reward, next_state, terminal)) return next_state, reward, terminal, 0 @@ -123,7 +112,7 @@ class RosEnvironment(Environment): time_of_timeout = time.time() + self.__timeout_in_s timeout_counter = 0 while(self.__waiting_for_state_update - or self.__waiting_for_terminal_update): + or self.__waiting_for_terminal_update or self.__waiting_for_reward_update): is_timeout = (time.time() > time_of_timeout) if (is_timeout): if timeout_counter < 3: @@ -134,22 +123,22 @@ class RosEnvironment(Environment): else: rospy.logerr("Timeout 3 times in a row: Terminate application") exit() - time.sleep(100/1000) + time.sleep(1/500) def close(self): rospy.signal_shutdown('Program ended!') def __state_callback(self, data): self.__last_received_state = np.array(data.data, dtype='float32').reshape((8,)) - rospy.logdebug('Received state: {}'.format(self.__last_received_state)) + logger.debug('Received state: {}'.format(self.__last_received_state)) self.__waiting_for_state_update = False def __terminal_state_callback(self, data): - self.__last_received_terminal = data.data - rospy.logdebug('Received terminal flag: {}'.format(self.__last_received_terminal)) - logger.debug('Received terminal: {}'.format(self.__last_received_terminal)) + self.__last_received_terminal = np.bool(data.data) + logger.debug('Received terminal flag: {}'.format(self.__last_received_terminal)) self.__waiting_for_terminal_update = False - def __calc_reward(self, state, terminal): - # C++ Wrapper call - return self._reward_function.reward(state, terminal) + def __reward_callback(self, data): + self.__last_received_reward = np.float32(data.data) + logger.debug('Received reward: {}'.format(self.__last_received_reward)) + self.__waiting_for_reward_update = False diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/strategy.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/strategy.py index 8e4c279e5569d5b1505ee4fb9f6e9136a40c30bb..d2d2386658c56c5ebb6d654468669a2664622305 100644 --- a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/strategy.py +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/strategy.py @@ -13,18 +13,21 @@ class StrategyBuilder(object): epsilon_decay_method='no', epsilon_decay=0.0, epsilon_decay_start=0, + epsilon_decay_per_step=False, action_dim=None, action_low=None, action_high=None, mu=0.0, theta=0.5, - sigma=0.3 + sigma=0.3, + noise_variance=0.1 ): if epsilon_decay_method == 'linear': decay = LinearDecay( eps_decay=epsilon_decay, min_eps=min_epsilon, - decay_start=epsilon_decay_start) + decay_start=epsilon_decay_start, + decay_per_step=epsilon_decay_per_step) else: decay = NoDecay() @@ -44,6 +47,13 @@ class StrategyBuilder(object): return OrnsteinUhlenbeckStrategy( action_dim, action_low, action_high, epsilon, mu, theta, sigma, decay) + elif method == 'gaussian': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert noise_variance is not None + return GaussianNoiseStrategy(action_dim, action_low, action_high, + epsilon, noise_variance, decay) else: assert action_dim is not None assert len(action_dim) == 1 @@ -70,17 +80,27 @@ class NoDecay(BaseDecay): class LinearDecay(BaseDecay): - def __init__(self, eps_decay, min_eps=0, decay_start=0): + def __init__(self, 
eps_decay, min_eps=0, decay_start=0, decay_per_step=False): super(LinearDecay, self).__init__() self.eps_decay = eps_decay self.min_eps = min_eps self.decay_start = decay_start + self.decay_per_step = decay_per_step + self.last_episode = -1 - def decay(self, cur_eps, episode): - if episode < self.decay_start: - return cur_eps + def do_decay(self, episode): + if self.decay_per_step: + do = (episode >= self.decay_start) else: + do = ((self.last_episode != episode) and (episode >= self.decay_start)) + self.last_episode = episode + return do + + def decay(self, cur_eps, episode): + if self.do_decay(episode): return max(cur_eps - self.eps_decay, self.min_eps) + else: + return cur_eps class BaseStrategy(object): @@ -170,3 +190,29 @@ class OrnsteinUhlenbeckStrategy(BaseStrategy): noise = self._evolve_state() action = (1.0 - self.cur_eps) * values + (self.cur_eps * noise) return np.clip(action, self._action_low, self._action_high) + + +class GaussianNoiseStrategy(BaseStrategy): + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + noise_variance, + decay=NoDecay() + ): + super(GaussianNoiseStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._noise_variance = noise_variance + + def select_action(self, values): + noise = np.random.normal(loc=0.0, scale=self._noise_variance, size=self._action_dim) + action = values + self.cur_eps * noise + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/util.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/util.py index 78e7795fd290f99980c689ad1c76f274a821f830..1d8b38c4a32a1cc6c0e364054e746e9115d7d05b 100644 --- a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/util.py +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/util.py @@ -11,8 +11,8 @@ import cnnarch_logger LOSS_FUNCTIONS = { 'l1': gluon.loss.L1Loss(), - 'euclidean': gluon.loss.L2Loss(), - 'huber_loss': gluon.loss.HuberLoss(), + 'l2': gluon.loss.L2Loss(), + 'huber': gluon.loss.HuberLoss(), 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} diff --git a/src/test/resources/target_code/td3/CNNTrainer_tD3Config.py b/src/test/resources/target_code/td3/CNNTrainer_tD3Config.py new file mode 100644 index 0000000000000000000000000000000000000000..4b184a417c139d97b7e6e867b52647ba872a0d3d --- /dev/null +++ b/src/test/resources/target_code/td3/CNNTrainer_tD3Config.py @@ -0,0 +1,121 @@ +from reinforcement_learning.agent import TwinDelayedDdpgAgent +from reinforcement_learning.util import AgentSignalHandler +from reinforcement_learning.cnnarch_logger import ArchLogger +from reinforcement_learning.CNNCreator_CriticNetwork import CNNCreator_CriticNetwork +import reinforcement_learning.environment +import CNNCreator_tD3Config + +import os +import sys +import re +import time +import numpy as np +import mxnet as mx + + +def resume_session(sessions_dir): + resume_session = False + resume_directory = None + if os.path.isdir(sessions_dir): + regex = re.compile(r'\d\d\d\d-\d\d-\d\d-\d\d-\d\d') + dir_content = os.listdir(sessions_dir) + session_files = filter(regex.search, dir_content) + session_files.sort(reverse=True) + for d in session_files: + interrupted_session_dir = os.path.join(sessions_dir, d, '.interrupted_session') + if 
os.path.isdir(interrupted_session_dir): + resume = raw_input('Interrupted session from {} found. Do you want to resume? (y/n) '.format(d)) + if resume == 'y': + resume_session = True + resume_directory = interrupted_session_dir + break + return resume_session, resume_directory + + +if __name__ == "__main__": + agent_name = 'tD3Config' + # Prepare output directory and logger + all_output_dir = os.path.join('model', agent_name) + output_directory = os.path.join( + all_output_dir, + time.strftime('%Y-%m-%d-%H-%M-%S', + time.localtime(time.time()))) + ArchLogger.set_output_directory(output_directory) + ArchLogger.set_logger_name(agent_name) + ArchLogger.set_output_level(ArchLogger.INFO) + + env = reinforcement_learning.environment.GymEnvironment('CartPole-v1') + + context = mx.cpu() + actor_creator = CNNCreator_tD3Config.CNNCreator_tD3Config() + actor_creator.construct(context) + critic_creator = CNNCreator_CriticNetwork() + critic_creator.construct(context) + + agent_params = { + 'environment': env, + 'replay_memory_params': { + 'method': 'online', + 'state_dtype': 'float32', + 'action_dtype': 'float32', + 'rewards_dtype': 'float32' + }, + 'strategy_params': { + 'method':'gaussian', + 'epsilon': 1, + 'min_epsilon': 0.001, + 'epsilon_decay_method': 'linear', + 'epsilon_decay': 0.0001, + 'epsilon_decay_start': 50, + 'epsilon_decay_per_step': True, + 'noise_variance': 0.3, + 'action_low': -1, + 'action_high': 1, + }, + 'agent_name': agent_name, + 'verbose': True, + 'output_directory': output_directory, + 'state_dim': (8,), + 'action_dim': (3,), + 'actor': actor_creator.networks[0], + 'critic': critic_creator.networks[0], + 'soft_target_update_rate': 0.001, + 'actor_optimizer': 'adam', + 'actor_optimizer_params': { + 'learning_rate_minimum': 5.0E-5, + 'learning_rate_policy': 'step', + 'learning_rate': 1.0E-4, + 'learning_rate_decay': 0.9}, + 'critic_optimizer': 'rmsprop', + 'critic_optimizer_params': { + 'learning_rate_minimum': 1.0E-4, + 'learning_rate_policy': 'step', + 'learning_rate': 0.001, + 'learning_rate_decay': 0.5}, + 'policy_noise': 0.1, + 'noise_clip': 0.8, + 'policy_delay': 4, + } + + resume, resume_directory = resume_session(all_output_dir) + + if resume: + output_directory, _ = os.path.split(resume_directory) + ArchLogger.set_output_directory(output_directory) + resume_agent_params = { + 'session_dir': resume_directory, + 'environment': env, + 'actor': actor_creator.networks[0], + 'critic': critic_creator.networks[0] + } + agent = TwinDelayedDdpgAgent.resume_from_session(**resume_agent_params) + else: + agent = TwinDelayedDdpgAgent(**agent_params) + + signal_handler = AgentSignalHandler() + signal_handler.register_agent(agent) + + train_successful = agent.train() + + if train_successful: + agent.export_best_network(path=actor_creator._model_dir_ + actor_creator._model_prefix_ + '_0_newest', epoch=0) diff --git a/src/test/resources/target_code/td3/reinforcement_learning/CNNCreator_CriticNetwork.py b/src/test/resources/target_code/td3/reinforcement_learning/CNNCreator_CriticNetwork.py new file mode 100644 index 0000000000000000000000000000000000000000..e4bc60aef16f924740118f8b422ea3e59b692227 --- /dev/null +++ b/src/test/resources/target_code/td3/reinforcement_learning/CNNCreator_CriticNetwork.py @@ -0,0 +1,59 @@ +import mxnet as mx +import logging +import os +from CNNNet_CriticNetwork import Net_0 + +class CNNCreator_CriticNetwork: + _model_dir_ = "model/CriticNetwork/" + _model_prefix_ = "model" + + def __init__(self): + self.weight_initializer = mx.init.Normal() + self.networks = 
{} + + def load(self, context): + earliestLastEpoch = None + + for i, network in self.networks.items(): + lastEpoch = 0 + param_file = None + + try: + os.remove(self._model_dir_ + self._model_prefix_ + "_" + str(i) + "_newest-0000.params") + except OSError: + pass + try: + os.remove(self._model_dir_ + self._model_prefix_ + "_" + str(i) + "_newest-symbol.json") + except OSError: + pass + + if os.path.isdir(self._model_dir_): + for file in os.listdir(self._model_dir_): + if ".params" in file and self._model_prefix_ + "_" + str(i) in file: + epochStr = file.replace(".params","").replace(self._model_prefix_ + "_" + str(i) + "-","") + epoch = int(epochStr) + if epoch > lastEpoch: + lastEpoch = epoch + param_file = file + if param_file is None: + earliestLastEpoch = 0 + else: + logging.info("Loading checkpoint: " + param_file) + network.load_parameters(self._model_dir_ + param_file) + + if earliestLastEpoch == None or lastEpoch < earliestLastEpoch: + earliestLastEpoch = lastEpoch + + return earliestLastEpoch + + def construct(self, context, data_mean=None, data_std=None): + self.networks[0] = Net_0(data_mean=data_mean, data_std=data_std) + self.networks[0].collect_params().initialize(self.weight_initializer, ctx=context) + self.networks[0].hybridize() + self.networks[0](mx.nd.zeros((1, 8,), ctx=context), mx.nd.zeros((1, 3,), ctx=context)) + + if not os.path.exists(self._model_dir_): + os.makedirs(self._model_dir_) + + for i, network in self.networks.items(): + network.export(self._model_dir_ + self._model_prefix_ + "_" + str(i), epoch=0) diff --git a/src/test/resources/target_code/td3/reinforcement_learning/CNNNet_CriticNetwork.py b/src/test/resources/target_code/td3/reinforcement_learning/CNNNet_CriticNetwork.py new file mode 100644 index 0000000000000000000000000000000000000000..33b3164f7ac15f37762f961fb4d8150901701a38 --- /dev/null +++ b/src/test/resources/target_code/td3/reinforcement_learning/CNNNet_CriticNetwork.py @@ -0,0 +1,133 @@ +import mxnet as mx +import numpy as np +from mxnet import gluon + +class OneHot(gluon.HybridBlock): + def __init__(self, size, **kwargs): + super(OneHot, self).__init__(**kwargs) + with self.name_scope(): + self.size = size + + def hybrid_forward(self, F, x): + return F.one_hot(indices=F.argmax(data=x, axis=1), depth=self.size) + + +class Softmax(gluon.HybridBlock): + def __init__(self, **kwargs): + super(Softmax, self).__init__(**kwargs) + + def hybrid_forward(self, F, x): + return F.softmax(x) + + +class Split(gluon.HybridBlock): + def __init__(self, num_outputs, axis=1, **kwargs): + super(Split, self).__init__(**kwargs) + with self.name_scope(): + self.axis = axis + self.num_outputs = num_outputs + + def hybrid_forward(self, F, x): + return F.split(data=x, axis=self.axis, num_outputs=self.num_outputs) + + +class Concatenate(gluon.HybridBlock): + def __init__(self, dim=1, **kwargs): + super(Concatenate, self).__init__(**kwargs) + with self.name_scope(): + self.dim = dim + + def hybrid_forward(self, F, *x): + return F.concat(*x, dim=self.dim) + + +class ZScoreNormalization(gluon.HybridBlock): + def __init__(self, data_mean, data_std, **kwargs): + super(ZScoreNormalization, self).__init__(**kwargs) + with self.name_scope(): + self.data_mean = self.params.get('data_mean', shape=data_mean.shape, + init=mx.init.Constant(data_mean.asnumpy().tolist()), differentiable=False) + self.data_std = self.params.get('data_std', shape=data_mean.shape, + init=mx.init.Constant(data_std.asnumpy().tolist()), differentiable=False) + + def hybrid_forward(self, F, x, data_mean, 
data_std): + x = F.broadcast_sub(x, data_mean) + x = F.broadcast_div(x, data_std) + return x + + +class Padding(gluon.HybridBlock): + def __init__(self, padding, **kwargs): + super(Padding, self).__init__(**kwargs) + with self.name_scope(): + self.pad_width = padding + + def hybrid_forward(self, F, x): + x = F.pad(data=x, + mode='constant', + pad_width=self.pad_width, + constant_value=0) + return x + + +class NoNormalization(gluon.HybridBlock): + def __init__(self, **kwargs): + super(NoNormalization, self).__init__(**kwargs) + + def hybrid_forward(self, F, x): + return x + + +class Net_0(gluon.HybridBlock): + def __init__(self, data_mean=None, data_std=None, **kwargs): + super(Net_0, self).__init__(**kwargs) + self.last_layers = {} + with self.name_scope(): + if data_mean: + assert(data_std) + self.input_normalization_state = ZScoreNormalization(data_mean=data_mean['state'], + data_std=data_std['state']) + else: + self.input_normalization_state = NoNormalization() + + self.fc2_1_ = gluon.nn.Dense(units=300, use_bias=True) + # fc2_1_, output shape: {[300,1,1]} + + self.relu2_1_ = gluon.nn.Activation(activation='relu') + self.fc3_1_ = gluon.nn.Dense(units=600, use_bias=True) + # fc3_1_, output shape: {[600,1,1]} + + if data_mean: + assert(data_std) + self.input_normalization_action = ZScoreNormalization(data_mean=data_mean['action'], + data_std=data_std['action']) + else: + self.input_normalization_action = NoNormalization() + + self.fc2_2_ = gluon.nn.Dense(units=600, use_bias=True) + # fc2_2_, output shape: {[600,1,1]} + + self.fc4_ = gluon.nn.Dense(units=600, use_bias=True) + # fc4_, output shape: {[600,1,1]} + + self.relu4_ = gluon.nn.Activation(activation='relu') + self.fc5_ = gluon.nn.Dense(units=1, use_bias=True) + # fc5_, output shape: {[1,1,1]} + + + + def hybrid_forward(self, F, state, action): + outputs = [] + state = self.input_normalization_state(state) + fc2_1_ = self.fc2_1_(state) + relu2_1_ = self.relu2_1_(fc2_1_) + fc3_1_ = self.fc3_1_(relu2_1_) + action = self.input_normalization_action(action) + fc2_2_ = self.fc2_2_(action) + add4_ = fc3_1_ + fc2_2_ + fc4_ = self.fc4_(add4_) + relu4_ = self.relu4_(fc4_) + fc5_ = self.fc5_(relu4_) + outputs.append(fc5_) + + return outputs[0] diff --git a/src/test/resources/target_code/td3/reinforcement_learning/__init__.py b/src/test/resources/target_code/td3/reinforcement_learning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/test/resources/target_code/td3/reinforcement_learning/agent.py b/src/test/resources/target_code/td3/reinforcement_learning/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..148c89848b8d990b5f635c3976fd15e71a608893 --- /dev/null +++ b/src/test/resources/target_code/td3/reinforcement_learning/agent.py @@ -0,0 +1,1269 @@ +import mxnet as mx +import numpy as np +import time +import os +import sys +import util +import matplotlib.pyplot as plt +import pyprind +from cnnarch_logger import ArchLogger +from replay_memory import ReplayMemoryBuilder +from strategy import StrategyBuilder +from util import copy_net, get_loss_function,\ + copy_net_with_two_inputs, DdpgTrainingStats, DqnTrainingStats,\ + make_directory_if_not_exist +from mxnet import nd, gluon, autograd + + +class Agent(object): + def __init__( + self, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + training_episodes=50, + train_interval=1, + start_training=0, + 
snapshot_interval=200, + agent_name='Agent', + max_episode_step=99999, + evaluation_samples=1000, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + assert 0 < discount_factor <= 1,\ + 'Discount factor must be between 0 and 1' + assert train_interval > 0, 'Train interval must be greater 0' + assert snapshot_interval > 0, 'Snapshot interval must be greater 0' + assert max_episode_step > 0,\ + 'Maximal steps per episode must be greater 0' + assert training_episodes > 0, 'Trainings episode must be greater 0' + assert replay_memory_params is not None,\ + 'Replay memory parameter not set' + assert type(state_dim) is tuple, 'State dimension is not a tuple' + assert type(action_dim) is tuple, 'Action dimension is not a tuple' + + self._logger = ArchLogger.get_logger() + self._ctx = mx.gpu() if ctx == 'gpu' else mx.cpu() + self._environment = environment + self._discount_factor = discount_factor + self._training_episodes = training_episodes + self._train_interval = train_interval + self._verbose = verbose + self._state_dim = state_dim + + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = action_dim + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + self._action_dim = action_dim + + strategy_params['action_dim'] = self._action_dim + self._strategy_params = strategy_params + strategy_builder = StrategyBuilder() + self._strategy = strategy_builder.build_by_params(**strategy_params) + self._agent_name = agent_name + self._snapshot_interval = snapshot_interval + self._creation_time = time.time() + self._max_episode_step = max_episode_step + self._start_training = start_training + self._output_directory = output_directory + self._target_score = target_score + + self._evaluation_samples = evaluation_samples + self._best_avg_score = -np.infty + self._best_net = None + + self._interrupt_flag = False + self._training_stats = None + + # Training Context + self._current_episode = 0 + self._total_steps = 0 + + @property + def current_episode(self): + return self._current_episode + + @property + def environment(self): + return self._environment + + def save_config_file(self): + import json + make_directory_if_not_exist(self._output_directory) + filename = os.path.join(self._output_directory, 'config.json') + config = self._make_config_dict() + with open(filename, mode='w') as fp: + json.dump(config, fp, indent=4) + + def set_interrupt_flag(self, interrupt): + self._interrupt_flag = interrupt + + def _interrupt_training(self): + import pickle + self._logger.info('Training interrupted; Store state for resuming') + session_dir = self._get_session_dir() + agent_session_file = os.path.join(session_dir, 'agent.p') + logger = self._logger + + self._training_stats.save_stats(self._output_directory, episode=self._current_episode) + + self._make_pickle_ready(session_dir) + + with open(agent_session_file, 'wb') as f: + pickle.dump(self, f, protocol=2) + logger.info('State successfully stored') + + def _make_pickle_ready(self, session_dir): + del self._training_stats.logger + self._environment.close() + self._environment = None + self._export_net(self._best_net, 'best_net', filedir=session_dir) + self._logger = None + self._best_net = None + + def _make_config_dict(self): + config = dict() + config['state_dim'] = self._state_dim + config['action_dim'] = self._action_dim + config['ctx'] = str(self._ctx) 
+ config['discount_factor'] = self._discount_factor + config['strategy_params'] = self._strategy_params + config['replay_memory_params'] = self._replay_memory_params + config['training_episodes'] = self._training_episodes + config['start_training'] = self._start_training + config['evaluation_samples'] = self._evaluation_samples + config['train_interval'] = self._train_interval + config['snapshot_interval'] = self._snapshot_interval + config['agent_name'] = self._agent_name + config['max_episode_step'] = self._max_episode_step + config['output_directory'] = self._output_directory + config['verbose'] = self._verbose + config['target_score'] = self._target_score + return config + + def _adjust_optimizer_params(self, optimizer_params): + if 'weight_decay' in optimizer_params: + optimizer_params['wd'] = optimizer_params['weight_decay'] + del optimizer_params['weight_decay'] + if 'learning_rate_decay' in optimizer_params: + min_learning_rate = 1e-8 + if 'learning_rate_minimum' in optimizer_params: + min_learning_rate = optimizer_params['learning_rate_minimum'] + del optimizer_params['learning_rate_minimum'] + optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( + optimizer_params['step_size'], + factor=optimizer_params['learning_rate_decay'], + stop_factor_lr=min_learning_rate) + del optimizer_params['step_size'] + del optimizer_params['learning_rate_decay'] + + return optimizer_params + + def _sample_from_memory(self): + states, actions, rewards, next_states, terminals\ + = self._memory.sample(batch_size=self._minibatch_size) + states = nd.array(states, ctx=self._ctx) + actions = nd.array(actions, ctx=self._ctx) + rewards = nd.array(rewards, ctx=self._ctx) + next_states = nd.array(next_states, ctx=self._ctx) + terminals = nd.array(terminals, ctx=self._ctx) + return states, actions, rewards, next_states, terminals + + def evaluate(self, target=None, sample_games=100, verbose=True): + if sample_games <= 0: + return 0 + + target = self._target_score if target is None else target + if target: + target_achieved = 0 + total_reward = 0 + + self._logger.info('Sampling from {} games...'.format(sample_games)) + for g in pyprind.prog_bar(range(sample_games)): + state = self._environment.reset() + step = 0 + game_reward = 0 + terminal = False + while not terminal and (step < self._max_episode_step): + action = self.get_next_action(state) + state, reward, terminal, _ = self._environment.step(action) + game_reward += reward + step += 1 + + if verbose: + info = 'Game %d: Reward %f' % (g, game_reward) + self._logger.debug(info) + if target: + if game_reward >= target: + target_achieved += 1 + total_reward += game_reward + + avg_reward = float(total_reward)/float(sample_games) + info = 'Avg. 
Reward: %f' % avg_reward + if target: + target_achieved_ratio = int( + (float(target_achieved)/float(sample_games))*100) + info += '; Target Achieved in %d%% of games'\ + % (target_achieved_ratio) + + if verbose: + self._logger.info(info) + return avg_reward + + def _do_snapshot_if_in_interval(self, episode): + do_snapshot =\ + (episode != 0 and (episode % self._snapshot_interval == 0)) + if do_snapshot: + self.save_parameters(episode=episode) + self._evaluate() + + def _evaluate(self, verbose=True): + avg_reward = self.evaluate( + sample_games=self._evaluation_samples, verbose=False) + info = 'Evaluation -> Average Reward in {} games: {}'.format( + self._evaluation_samples, avg_reward) + + if self._best_avg_score is None or self._best_avg_score <= avg_reward: + self._save_current_as_best_net() + self._best_avg_score = avg_reward + if verbose: + self._logger.info(info) + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _do_training(self): + return (self._total_steps % self._train_interval == 0) and\ + (self._memory.is_sample_possible(self._minibatch_size)) and\ + (self._current_episode >= self._start_training) + + def _check_interrupt_routine(self): + if self._interrupt_flag: + self._interrupt_flag = False + self._interrupt_training() + return True + return False + + def _is_target_reached(self, avg_reward): + return self._target_score is not None\ + and avg_reward > self._target_score + + def _export_net(self, net, filename, filedir=None, episode=None): + assert self._output_directory + assert isinstance(net, gluon.HybridBlock) + make_directory_if_not_exist(self._output_directory) + filedir = self._output_directory if filedir is None else filedir + filename = os.path.join(filedir, filename) + + if episode is not None: + filename = filename + '-ep{}'.format(episode) + + net.export(filename, epoch=0) + net.save_parameters(filename + '.params') + + def export_best_network(self, path=None, epoch=0): + path = os.path.join(self._output_directory, 'best_network')\ + if path is None else path + self._logger.info( + 'Saving best network with average reward of {}'.format( + self._best_avg_score)) + self._best_net.export(path, epoch=epoch) + + def _get_session_dir(self): + session_dir = os.path.join( + self._output_directory, '.interrupted_session') + make_directory_if_not_exist(session_dir) + return session_dir + + def _save_current_as_best_net(self): + raise NotImplementedError + + def get_next_action(self, state): + raise NotImplementedError + + def save_parameters(self, episode): + raise NotImplementedError + + def train(self, episodes=None): + raise NotImplementedError + + +class DdpgAgent(Agent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, + training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + 
training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + assert critic is not None, 'Critic not set' + assert actor is not None, 'Actor is not set' + assert soft_target_update_rate > 0,\ + 'Target update must be greater zero' + assert actor_optimizer is not None, 'No actor optimizer set' + assert critic_optimizer is not None, 'No critic optimizer set' + + self._actor = actor + self._critic = critic + + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + self._actor_optimizer = actor_optimizer + self._actor_optimizer_params = self._adjust_optimizer_params( + actor_optimizer_params) + + self._critic_optimizer = critic_optimizer + self._critic_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + self._soft_target_update_rate = soft_target_update_rate + + self._logger.info( + 'Agent created with following parameters: {}'.format( + self._make_config_dict())) + + self._best_net = self._copy_actor() + + self._training_stats = DdpgTrainingStats(self._training_episodes) + + def _make_pickle_ready(self, session_dir): + super(DdpgAgent, self)._make_pickle_ready(session_dir) + self._export_net(self._actor, 'current_actor') + + self._export_net(self._actor, 'actor', filedir=session_dir) + self._actor = None + self._export_net(self._critic, 'critic', filedir=session_dir) + self._critic = None + self._export_net( + self._actor_target, 'actor_target', filedir=session_dir) + self._actor_target = None + self._export_net( + self._critic_target, 'critic_target', filedir=session_dir) + self._critic_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), 
ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _save_current_as_best_net(self): + self._best_net = self._copy_actor() + + def get_next_action(self, state): + action = self._actor(nd.array([state], ctx=self._ctx)) + return action[0].asnumpy() + + def save_parameters(self, episode): + self._export_net( + self._actor, self._agent_name + '_actor', episode=episode) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start DDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + self._strategy.decay(self._current_episode) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + actor_target_actions = self._actor_target(next_states) + critic_target_qvalues = self._critic_target( + next_states, actor_target_actions) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) + y = rewards + (1.0 - terminals) * self._discount_factor\ + * critic_target_qvalues + + # Train the critic network + with autograd.record(): + qvalues = self._critic(states, actions) + critic_loss = l2_loss(qvalues, y) + critic_loss.backward() + trainer_critic.step(self._minibatch_size) + + # Train the actor network + # Temporary critic so that gluon trainer 
does not mess + # with critic parameters + tmp_critic = self._copy_critic() + with autograd.record(): + # For maximizing qvalues we have to multiply with -1 + # as we use a minimizer + actor_loss = -tmp_critic( + states, self._actor(states)).mean() + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss +=\ + np.sum(actor_loss.asnumpy()) / self._minibatch_size + episode_avg_q_value +=\ + np.sum(qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if training_steps == 0\ + else (episode_actor_loss / training_steps) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if training_steps == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.export_best_network() + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DdpgAgent, self)._make_config_dict() + config['soft_target_update_rate'] = self._soft_target_update_rate + config['actor_optimizer'] = self._actor_optimizer + config['actor_optimizer_params'] = self._actor_optimizer_params + config['critic_optimizer'] = self._critic_optimizer + config['critic_optimizer_params'] = self._critic_optimizer_params + return config + + def _soft_update(self, net, target, tau): + net_params = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(target.collect_params().items()): + target_params = p.data() + p.set_data((1.0 - tau) * target_params + tau * net_params[i]) + return target + + def _copy_actor(self): + assert self._actor is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + return copy_net(self._actor, self._state_dim, ctx=self._ctx) + + def _copy_critic(self): + assert self._critic is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic, self._state_dim, self._action_dim, ctx=self._ctx) + + +class TwinDelayedDdpgAgent(DdpgAgent): + def __init__( + self, + actor, + critic, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + soft_target_update_rate=.001, + actor_optimizer='adam', + actor_optimizer_params={'learning_rate': 0.0001}, + critic_optimizer='adam', + critic_optimizer_params={'learning_rate': 0.001}, + ctx=None, + discount_factor=.9, 
+ training_episodes=50, + start_training=20, + train_interval=1, + snapshot_interval=200, + agent_name='DdpgAgent', + max_episode_step=9999, + evaluation_samples=100, + output_directory='model_parameters', + verbose=True, + target_score=None, + policy_noise=0.2, + noise_clip=0.5, + policy_delay=2 + ): + super(TwinDelayedDdpgAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples, + critic=critic, soft_target_update_rate=soft_target_update_rate, + actor=actor, actor_optimizer=actor_optimizer, + actor_optimizer_params=actor_optimizer_params, + critic_optimizer=critic_optimizer, + critic_optimizer_params=critic_optimizer_params) + + self._policy_noise = policy_noise + self._noise_clip = noise_clip + self._policy_delay = policy_delay + + self._critic2 = self._critic.__class__() + self._critic2.collect_params().initialize( + mx.init.Normal(), ctx=self._ctx) + self._critic2.hybridize() + self._critic2(nd.ones((1,) + state_dim, ctx=self._ctx), + nd.ones((1,) + action_dim, ctx=self._ctx)) + + self._critic2_target = self._copy_critic2() + + self._critic2_optimizer = critic_optimizer + self._critic2_optimizer_params = self._adjust_optimizer_params( + critic_optimizer_params) + + def _make_pickle_ready(self, session_dir): + super(TwinDelayedDdpgAgent, self)._make_pickle_ready(session_dir) + self._export_net(self._critic2, 'critic2', filedir=session_dir) + self._critic2 = None + self._export_net( + self._critic2_target, 'critic2_target', filedir=session_dir) + self._critic2_target = None + + @classmethod + def resume_from_session(cls, session_dir, actor, critic, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['actor_net_params'] = os.path.join(session_dir, 'actor.params') + files['actor_target_net_params'] = os.path.join( + session_dir, 'actor_target.params') + files['critic_net_params'] = os.path.join(session_dir, 'critic.params') + files['critic_target_net_params'] = os.path.join( + session_dir, 'critic_target.params') + files['critic2_net_params'] = os.path.join( + session_dir, 'critic2.params') + files['critic2_target_net_params'] = os.path.join( + session_dir, 'critic2_target.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + + agent._actor = actor + agent._actor.load_parameters(files['actor_net_params'], agent._ctx) + agent._actor.hybridize() + agent._actor(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + + agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx) + agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + + agent._actor_target = copy_net( + agent._actor, agent._state_dim, agent._ctx) + 
agent._actor_target.load_parameters(files['actor_target_net_params']) + + agent._critic = critic + agent._critic.load_parameters(files['critic_net_params'], agent._ctx) + agent._critic.hybridize() + agent._critic( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic_target = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic_target.load_parameters(files['critic_target_net_params']) + + agent._critic2 = copy_net_with_two_inputs( + agent._critic, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2.load_parameters(files['critic2_net_params'], agent._ctx) + agent._critic2.hybridize() + agent._critic2( + nd.random_normal(shape=((1,) + agent._state_dim), ctx=agent._ctx), + nd.random_normal(shape=((1,) + agent._action_dim), ctx=agent._ctx)) + + agent._critic2_target = copy_net_with_two_inputs( + agent._critic2, agent._state_dim, agent._action_dim, agent._ctx) + agent._critic2_target.load_parameters( + files['critic2_target_net_params']) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = ArchLogger.get_logger() + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _copy_critic2(self): + assert self._critic2 is not None + assert self._ctx is not None + assert type(self._state_dim) is tuple + assert type(self._action_dim) is tuple + return copy_net_with_two_inputs( + self._critic2, self._state_dim, self._action_dim, ctx=self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start TwinDelayedDDPG training ---") + episodes = \ + episodes if episodes is not None else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info( + "Starting from episode {}".format(self._current_episode)) + else: + self._training_stats = DdpgTrainingStats(episodes) + + # Initialize target Q1' and Q2' and mu' + self._actor_target = self._copy_actor() + self._critic_target = self._copy_critic() + self._critic2_target = self._copy_critic2() + + # Initialize l2 loss for critic network + l2_loss = gluon.loss.L2Loss() + + # Initialize critic and actor trainer + trainer_actor = gluon.Trainer( + self._actor.collect_params(), self._actor_optimizer, + self._actor_optimizer_params) + trainer_critic = gluon.Trainer( + self._critic.collect_params(), self._critic_optimizer, + self._critic_optimizer_params) + trainer_critic2 = gluon.Trainer( + self._critic2.collect_params(), self._critic2_optimizer, + self._critic2_optimizer_params) + + # For episode=1..n + while self._current_episode < episodes: + # Check interrupt flag + if self._check_interrupt_routine(): + return False + + # Initialize new episode + step = 0 + episode_reward = 0 + start = time.time() + episode_critic_loss = 0 + episode_actor_loss = 0 + episode_avg_q_value = 0 + training_steps = 0 + actor_updates = 0 + + # Get initialial observation state s + state = self._environment.reset() + + # For step=1..T + while step < self._max_episode_step: + # Select an action a = mu(s) + N(step) according to current + # actor and exploration noise N according to strategy + action = self._strategy.select_action( + self.get_next_action(state)) + self._strategy.decay(self._current_episode) + + # Execute action a and observe reward r and next state ns + next_state, reward, terminal, _ = \ + self._environment.step(action) + + 
self._logger.debug( + 'Applied action {} with reward {}'.format(action, reward)) + + # Store transition (s,a,r,ns) in replay buffer + self._memory.append( + state, action, reward, next_state, terminal) + + if self._do_training(): + # Sample random minibatch of b transitions + # (s_i, a_i, r_i, s_(i+1)) from replay buffer + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + + clipped_noise = nd.array( + np.clip( + np.random.normal( + loc=0, scale=self._policy_noise, + size=self._minibatch_size + ).reshape(self._minibatch_size, 1), + -self._noise_clip, + self._noise_clip + ), + ctx=self._ctx + ) + target_action = np.clip( + self._actor_target(next_states) + clipped_noise, + self._strategy._action_low, + self._strategy._action_high) + + rewards = rewards.reshape(self._minibatch_size, 1) + terminals = terminals.reshape(self._minibatch_size, 1) + + target_qvalues1 = self._critic_target(next_states, + target_action) + target_qvalues2 = self._critic2_target(next_states, + target_action) + target_qvalues = nd.minimum(target_qvalues1, + target_qvalues2) + y = rewards + (1 - terminals) * self._discount_factor\ + * target_qvalues + + with autograd.record(): + qvalues1 = self._critic(states, actions) + critic1_loss = l2_loss(qvalues1, y) + critic1_loss.backward() + trainer_critic.step(self._minibatch_size) + + with autograd.record(): + qvalues2 = self._critic2(states, actions) + critic2_loss = l2_loss(qvalues2, y) + critic2_loss.backward() + trainer_critic2.step(self._minibatch_size) + + critic_loss = (critic1_loss.mean() + critic2_loss.mean())/2 + + if self._total_steps % self._policy_delay == 0: + tmp_critic = self._copy_critic() + with autograd.record(): + actor_loss = -tmp_critic( + states, self._actor(states)).mean() + actor_loss.backward() + trainer_actor.step(self._minibatch_size) + + # Update target networks: + self._actor_target = self._soft_update( + self._actor, self._actor_target, + self._soft_target_update_rate) + self._critic_target = self._soft_update( + self._critic, self._critic_target, + self._soft_target_update_rate) + self._critic2_target = self._soft_update( + self._critic2, self._critic2_target, + self._soft_target_update_rate) + + actor_updates = actor_updates + 1 + else: + actor_loss = nd.array([0], ctx=self._ctx) + + # Update statistics + episode_critic_loss +=\ + np.sum(critic_loss.asnumpy()) / self._minibatch_size + episode_actor_loss += 0 if actor_updates == 0 else\ + np.sum(actor_loss.asnumpy()[0]) + episode_avg_q_value +=\ + np.sum(target_qvalues.asnumpy()) / self._minibatch_size + + training_steps += 1 + + episode_reward += reward + step += 1 + self._total_steps += 1 + state = next_state + + if terminal: + # Reset the strategy + self._strategy.reset() + break + + # Log the episode results + episode_actor_loss = 0 if actor_updates == 0\ + else (episode_actor_loss / actor_updates) + episode_critic_loss = 0 if training_steps == 0\ + else (episode_critic_loss / training_steps) + episode_avg_q_value = 0 if actor_updates == 0\ + else (episode_avg_q_value / training_steps) + + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_actor_loss, episode_critic_loss, episode_avg_q_value, + self._strategy.cur_eps, episode_reward) + + self._do_snapshot_if_in_interval(self._current_episode) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + 
self.save_parameters(episode=self._current_episode) + self.export_best_network() + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(TwinDelayedDdpgAgent, self)._make_config_dict() + config['policy_noise'] = self._policy_noise + config['noise_clip'] = self._noise_clip + config['policy_delay'] = self._policy_delay + return config + + +class DqnAgent(Agent): + def __init__( + self, + qnet, + environment, + replay_memory_params, + strategy_params, + state_dim, + action_dim, + ctx=None, + discount_factor=.9, + loss_function='l2', + optimizer='rmsprop', + optimizer_params={'learning_rate': 0.09}, + training_episodes=50, + start_training=0, + train_interval=1, + use_fix_target=False, + double_dqn=False, + target_update_interval=10, + snapshot_interval=200, + evaluation_samples=100, + agent_name='Dqn_agent', + max_episode_step=99999, + output_directory='model_parameters', + verbose=True, + target_score=None + ): + super(DqnAgent, self).__init__( + environment=environment, replay_memory_params=replay_memory_params, + strategy_params=strategy_params, state_dim=state_dim, + action_dim=action_dim, ctx=ctx, discount_factor=discount_factor, + training_episodes=training_episodes, start_training=start_training, + train_interval=train_interval, + snapshot_interval=snapshot_interval, agent_name=agent_name, + max_episode_step=max_episode_step, + output_directory=output_directory, verbose=verbose, + target_score=target_score, evaluation_samples=evaluation_samples) + + self._qnet = qnet + self._target_update_interval = target_update_interval + self._target_qnet = copy_net( + self._qnet, self._state_dim, ctx=self._ctx) + self._loss_function_str = loss_function + self._loss_function = get_loss_function(loss_function) + self._optimizer = optimizer + self._optimizer_params = optimizer_params + self._double_dqn = double_dqn + self._use_fix_target = use_fix_target + + # Build memory buffer for discrete actions + replay_memory_params['state_dim'] = state_dim + replay_memory_params['action_dim'] = (1,) + self._replay_memory_params = replay_memory_params + rm_builder = ReplayMemoryBuilder() + self._memory = rm_builder.build_by_params(**replay_memory_params) + self._minibatch_size = self._memory.sample_size + + # Initialize best network + self._best_net = copy_net(self._qnet, self._state_dim, self._ctx) + self._best_avg_score = -np.infty + + self._training_stats = None + + @classmethod + def resume_from_session(cls, session_dir, net, environment): + import pickle + if not os.path.exists(session_dir): + raise ValueError('Session directory does not exist') + + files = dict() + files['agent'] = os.path.join(session_dir, 'agent.p') + files['best_net_params'] = os.path.join(session_dir, 'best_net.params') + files['q_net_params'] = os.path.join(session_dir, 'qnet.params') + files['target_net_params'] = os.path.join( + session_dir, 'target_net.params') + + for file in files.values(): + if not os.path.exists(file): + raise ValueError( + 'Session directory is not complete: {} is missing' + .format(file)) + + with open(files['agent'], 'rb') as f: + agent = pickle.load(f) + + agent._environment = environment + agent._qnet = net + agent._qnet.load_parameters(files['q_net_params'], agent._ctx) + agent._qnet.hybridize() + agent._qnet(nd.random_normal( + shape=((1,) + agent._state_dim), ctx=agent._ctx)) + agent._best_net = copy_net(agent._qnet, agent._state_dim, agent._ctx) + 
agent._best_net.load_parameters(files['best_net_params'], agent._ctx) + agent._target_qnet = copy_net( + agent._qnet, agent._state_dim, agent._ctx) + agent._target_qnet.load_parameters( + files['target_net_params'], agent._ctx) + + agent._logger = ArchLogger.get_logger() + agent._training_stats.logger = agent._logger + agent._logger.info('Agent was retrieved; Training can be continued') + + return agent + + def _make_pickle_ready(self, session_dir): + super(DqnAgent, self)._make_pickle_ready(session_dir) + self._export_net(self._qnet, 'current_qnet') + self._export_net(self._qnet, 'qnet', filedir=session_dir) + self._qnet = None + self._export_net(self._target_qnet, 'target_net', filedir=session_dir) + self._target_qnet = None + + def get_q_values(self, state, with_best=False): + return self.get_batch_q_values( + nd.array([state], ctx=self._ctx), with_best=with_best)[0] + + def get_batch_q_values(self, state_batch, with_best=False): + return self._best_net(state_batch)\ + if with_best else self._qnet(state_batch) + + def get_next_action(self, state, with_best=False): + q_values = self.get_q_values(state, with_best=with_best) + action = q_values.asnumpy().argmax() + return action + + def __determine_target_q_values( + self, states, actions, rewards, next_states, terminals + ): + if self._use_fix_target: + q_max_val = self._target_qnet(next_states) + else: + q_max_val = self._qnet(next_states) + + if self._double_dqn: + q_values_next_states = self._qnet(next_states) + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_values_next_states))\ + * (1.0 - terminals) * self._discount_factor + else: + target_rewards = rewards + nd.choose_element_0index( + q_max_val, nd.argmax_channel(q_max_val))\ + * (1.0 - terminals) * self._discount_factor + + target_qval = self._qnet(states) + for t in range(target_rewards.shape[0]): + target_qval[t][actions[t]] = target_rewards[t] + + return target_qval + + def __train_q_net_step(self, trainer): + states, actions, rewards, next_states, terminals =\ + self._sample_from_memory() + target_qval = self.__determine_target_q_values( + states, actions, rewards, next_states, terminals) + with autograd.record(): + q_values = self._qnet(states) + loss = self._loss_function(q_values, target_qval) + loss.backward() + trainer.step(self._minibatch_size) + return loss + + def __do_target_update_if_in_interval(self, total_steps): + do_target_update = ( + self._use_fix_target and + (total_steps % self._target_update_interval == 0)) + if do_target_update: + self._logger.info( + 'Target network is updated after {} steps'.format(total_steps)) + self._target_qnet = copy_net( + self._qnet, self._state_dim, self._ctx) + + def train(self, episodes=None): + self.save_config_file() + self._logger.info("--- Start training ---") + trainer = gluon.Trainer( + self._qnet.collect_params(), + self._optimizer, + self._adjust_optimizer_params(self._optimizer_params)) + episodes = episodes if episodes is not None\ + else self._training_episodes + + resume = (self._current_episode > 0) + if resume: + self._logger.info("Training session resumed") + self._logger.info("Starting from episode {}".format( + self._current_episode)) + else: + self._training_stats = DqnTrainingStats(episodes) + + # Implementation Deep Q Learning described by + # Mnih et. al. 
in Playing Atari with Deep Reinforcement Learning + while self._current_episode < episodes: + if self._check_interrupt_routine(): + return False + + step = 0 + episode_reward = 0 + start = time.time() + state = self._environment.reset() + episode_loss = 0 + training_steps = 0 + while step < self._max_episode_step: + # 1. Choose an action based on current game state and policy + q_values = self._qnet(nd.array([state], ctx=self._ctx)) + action = self._strategy.select_action(q_values[0]) + self._strategy.decay(self._current_episode) + + # 2. Play the game for a single step + next_state, reward, terminal, _ =\ + self._environment.step(action) + + # 3. Store transition in replay memory + self._memory.append( + state, [action], reward, next_state, terminal) + + # 4. Train the network if in interval + if self._do_training(): + loss = self.__train_q_net_step(trainer) + training_steps += 1 + episode_loss +=\ + np.sum(loss.asnumpy()) / self._minibatch_size + + # Update target network if in interval + self.__do_target_update_if_in_interval(self._total_steps) + + step += 1 + self._total_steps += 1 + episode_reward += reward + state = next_state + + if terminal: + self._strategy.reset() + break + + self._do_snapshot_if_in_interval(self._current_episode) + + episode_loss = (episode_loss / training_steps)\ + if training_steps > 0 else 0 + avg_reward = self._training_stats.log_episode( + self._current_episode, start, training_steps, + episode_loss, self._strategy.cur_eps, episode_reward) + + if self._is_target_reached(avg_reward): + self._logger.info( + 'Target score is reached in average; Training is stopped') + break + + self._current_episode += 1 + + self._evaluate() + self.save_parameters(episode=self._current_episode) + self.export_best_network() + self._training_stats.save_stats(self._output_directory) + self._logger.info('--------- Training finished ---------') + return True + + def _make_config_dict(self): + config = super(DqnAgent, self)._make_config_dict() + config['optimizer'] = self._optimizer + config['optimizer_params'] = self._optimizer_params + config['loss_function'] = self._loss_function_str + config['use_fix_target'] = self._use_fix_target + config['double_dqn'] = self._double_dqn + config['target_update_interval'] = self._target_update_interval + return config + + def save_parameters(self, episode): + self._export_net( + self._qnet, self._agent_name + '_qnet', episode=episode) + + def _save_current_as_best_net(self): + self._best_net = copy_net( + self._qnet, self._state_dim, ctx=self._ctx) diff --git a/src/test/resources/target_code/td3/reinforcement_learning/cnnarch_logger.py b/src/test/resources/target_code/td3/reinforcement_learning/cnnarch_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..19f36b1bcac866424ffc63bd5e90d65c7c4b0acc --- /dev/null +++ b/src/test/resources/target_code/td3/reinforcement_learning/cnnarch_logger.py @@ -0,0 +1,95 @@ +import logging +import sys +import os +import util + + +class ArchLogger(object): + _logger = None + + __output_level = logging.INFO + __logger_name = 'agent' + __output_directory = '.' 
+ __append = True + __logformat = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + __dateformat = '%d-%b-%y %H:%M:%S' + + INFO = logging.INFO + DEBUG = logging.DEBUG + + @staticmethod + def set_output_level(output_level): + assert output_level is not None + ArchLogger.__output_level = output_level + + @staticmethod + def set_logger_name(logger_name): + assert logger_name is not None + ArchLogger.__logger_name = logger_name + + @staticmethod + def set_output_directory(output_directory): + assert output_directory is not None + ArchLogger.__output_directory = output_directory + + @staticmethod + def set_append(append): + assert append is not None + ArchLogger.__append = append + + @staticmethod + def set_log_format(logformat, dateformat): + assert logformat is not None + assert dateformat is not None + ArchLogger.__logformat = logformat + ArchLogger.__dateformat = dateformat + + @staticmethod + def init_logger(make_log_file=True): + assert ArchLogger._logger is None, 'Logger init already called' + filemode = 'a' if ArchLogger.__append else 'w' + formatter = logging.Formatter( + fmt=ArchLogger.__logformat, datefmt=ArchLogger.__dateformat) + + logger = logging.getLogger(ArchLogger.__logger_name) + logger.propagate = False + + if not logger.handlers: + logger.setLevel(ArchLogger.__output_level) + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setLevel(ArchLogger.__output_level) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + if make_log_file: + util.make_directory_if_not_exist(ArchLogger.__output_directory) + log_file = os.path.join( + ArchLogger.__output_directory, + ArchLogger.__logger_name + '.log') + file_handler = logging.FileHandler(log_file, mode=filemode) + file_handler.setLevel(ArchLogger.__output_level) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + ArchLogger._logger = logger + + @staticmethod + def get_logger(): + if ArchLogger._logger is None: + ArchLogger.init_logger() + assert ArchLogger._logger is not None + return ArchLogger._logger + +if __name__ == "__main__": + print('=== Test logger ===') + ArchLogger.set_logger_name('TestLogger') + ArchLogger.set_output_directory('test_log') + ArchLogger.init_logger() + logger = ArchLogger.get_logger() + logger.warning('This is a warning') + logger.debug('This is a debug information, which you should not see') + logger.info('This is a normal information') + assert os.path.exists('test_log')\ + and os.path.isfile(os.path.join('test_log', 'TestLogger.log')),\ + 'Test failed: No logfile exists' + import shutil + shutil.rmtree('test_log') \ No newline at end of file diff --git a/src/test/resources/target_code/td3/reinforcement_learning/environment.py b/src/test/resources/target_code/td3/reinforcement_learning/environment.py new file mode 100644 index 0000000000000000000000000000000000000000..381ec9eae2383af1d97fff93d0628ca945dc58e6 --- /dev/null +++ b/src/test/resources/target_code/td3/reinforcement_learning/environment.py @@ -0,0 +1,64 @@ +import abc +import logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class Environment: + __metaclass__ = abc.ABCMeta + + def __init__(self): + pass + + @abc.abstractmethod + def reset(self): + pass + + @abc.abstractmethod + def step(self, action): + pass + + @abc.abstractmethod + def close(self): + pass + +import gym +class GymEnvironment(Environment): + def __init__(self, env_name, **kwargs): + super(GymEnvironment, self).__init__(**kwargs) + self.__seed = 42 + self.__env = 
gym.make(env_name) + self.__env.seed(self.__seed) + + @property + def state_dim(self): + return self.__env.observation_space.shape + + + @property + def number_of_actions(self): + return self.__env.action_space.n + + @property + def rewards_dtype(self): + return 'float32' + + def reset(self): + return self.__env.reset() + + def step(self, action): + return self.__env.step(action) + + def close(self): + self.__env.close() + + def action_space(self): + self.__env.action_space + + def is_in_action_space(self, action): + return self.__env.action_space.contains(action) + + def sample_action(self): + return self.__env.action_space.sample() + + def render(self): + self.__env.render() diff --git a/src/test/resources/target_code/td3/reinforcement_learning/replay_memory.py b/src/test/resources/target_code/td3/reinforcement_learning/replay_memory.py new file mode 100644 index 0000000000000000000000000000000000000000..d22147a4050c80a558730cdde5099094837a0247 --- /dev/null +++ b/src/test/resources/target_code/td3/reinforcement_learning/replay_memory.py @@ -0,0 +1,208 @@ +import numpy as np + + +class ReplayMemoryBuilder(object): + def __init__(self): + self.__supported_methods = ['online', 'buffer', 'combined'] + + def build_by_params( + self, + state_dim, + method='online', + state_dtype='float32', + action_dim=(1,), + action_dtype='uint8', + rewards_dtype='float32', + memory_size=1000, + sample_size=32 + ): + assert state_dim is not None + assert action_dim is not None + assert method in self.__supported_methods + + if method == 'online': + return self.build_online_memory( + state_dim=state_dim, state_dtype=state_dtype, + action_dtype=action_dtype, action_dim=action_dim, + rewards_dtype=rewards_dtype) + else: + assert memory_size is not None and memory_size > 0 + assert sample_size is not None and sample_size > 0 + if method == 'buffer': + return self.build_buffered_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + else: + return self.build_combined_memory( + state_dim=state_dim, sample_size=sample_size, + memory_size=memory_size, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + + def build_buffered_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): + assert memory_size > 0 + assert sample_size > 0 + return ReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_combined_memory( + self, state_dim, memory_size, sample_size, state_dtype, action_dim, + action_dtype, rewards_dtype + ): + assert memory_size > 0 + assert sample_size > 0 + return CombinedReplayMemory( + state_dim, size=memory_size, sample_size=sample_size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + def build_online_memory( + self, state_dim, state_dtype, action_dtype, action_dim, rewards_dtype + ): + return OnlineReplayMemory( + state_dim, state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + +class ReplayMemory(object): + def __init__( + self, + state_dim, + sample_size, + size=1000, + action_dim=(1,), + state_dtype='float32', + action_dtype='uint8', + rewards_dtype='float32' + ): + assert size > 0, "Size must be greater 
than zero" + assert type(state_dim) is tuple, "State dimension must be a tuple" + assert type(action_dim) is tuple, "Action dimension must be a tuple" + assert sample_size > 0 + self._size = size + self._sample_size = sample_size + self._cur_size = 0 + self._pointer = 0 + self._state_dim = state_dim + self._state_dtype = state_dtype + self._action_dim = action_dim + self._action_dtype = action_dtype + self._rewards_dtype = rewards_dtype + self._states = np.zeros((self._size,) + state_dim, dtype=state_dtype) + self._actions = np.zeros( + (self._size,) + action_dim, dtype=action_dtype) + self._rewards = np.array([0] * self._size, dtype=rewards_dtype) + self._next_states = np.zeros( + (self._size,) + state_dim, dtype=state_dtype) + self._terminals = np.array([0] * self._size, dtype='bool') + + @property + def sample_size(self): + return self._sample_size + + def append(self, state, action, reward, next_state, terminal): + self._states[self._pointer] = state + self._actions[self._pointer] = action + self._rewards[self._pointer] = reward + self._next_states[self._pointer] = next_state + self._terminals[self._pointer] = terminal + + self._pointer = self._pointer + 1 + if self._pointer == self._size: + self._pointer = 0 + + self._cur_size = min(self._size, self._cur_size + 1) + + def at(self, index): + return self._states[index],\ + self._actions[index],\ + self._rewards[index],\ + self._next_states[index],\ + self._terminals[index] + + def is_sample_possible(self, batch_size=None): + batch_size = batch_size if batch_size is not None\ + else self._sample_size + return self._cur_size >= batch_size + + def sample(self, batch_size=None): + batch_size = batch_size if batch_size is not None\ + else self._sample_size + assert self._cur_size >= batch_size,\ + "Size of replay memory must be larger than batch size" + i = 0 + states = np.zeros(( + batch_size,)+self._state_dim, dtype=self._state_dtype) + actions = np.zeros( + (batch_size,)+self._action_dim, dtype=self._action_dtype) + rewards = np.zeros(batch_size, dtype=self._rewards_dtype) + next_states = np.zeros( + (batch_size,)+self._state_dim, dtype=self._state_dtype) + terminals = np.zeros(batch_size, dtype='bool') + + while i < batch_size: + rnd_index = np.random.randint(low=0, high=self._cur_size) + states[i] = self._states.take(rnd_index, axis=0) + actions[i] = self._actions.take(rnd_index, axis=0) + rewards[i] = self._rewards.take(rnd_index, axis=0) + next_states[i] = self._next_states.take(rnd_index, axis=0) + terminals[i] = self._terminals.take(rnd_index, axis=0) + i += 1 + + return states, actions, rewards, next_states, terminals + + +class OnlineReplayMemory(ReplayMemory): + def __init__( + self, state_dim, state_dtype='float32', action_dim=(1,), + action_dtype='uint8', rewards_dtype='float32' + ): + super(OnlineReplayMemory, self).__init__( + state_dim, sample_size=1, size=1, state_dtype=state_dtype, + action_dim=action_dim, action_dtype=action_dtype, + rewards_dtype=rewards_dtype) + + +class CombinedReplayMemory(ReplayMemory): + def __init__( + self, state_dim, sample_size, size=1000, state_dtype='float32', + action_dim=(1,), action_dtype='uint8', rewards_dtype='float32' + ): + super(CombinedReplayMemory, self).__init__( + state_dim=state_dim, sample_size=(sample_size - 1), size=size, + state_dtype=state_dtype, action_dim=action_dim, + action_dtype=action_dtype, rewards_dtype=rewards_dtype) + + self._last_state = np.zeros((1,) + state_dim, dtype=state_dtype) + self._last_action = np.array((1,) + action_dim, dtype=action_dtype) + 
self._last_reward = np.array([0], dtype=rewards_dtype) + self._last_next_state = np.zeros((1,) + state_dim, dtype=state_dtype) + self._last_terminal = np.array([0], dtype='bool') + + def append(self, state, action, reward, next_state, terminal): + super(CombinedReplayMemory, self).append( + state, action, reward, next_state, terminal) + self._last_state = state + self._last_action = action + self._last_reward = reward + self._last_next_state = next_state + self._last_terminal = terminal + + def sample(self, batch_size=None): + batch_size = (batch_size-1) if batch_size is not None\ + else self._sample_size + states, actions, rewards, next_states, terminals = super( + CombinedReplayMemory, self).sample(batch_size=batch_size) + states = np.append(states, [self._last_state], axis=0) + actions = np.append(actions, [self._last_action], axis=0) + rewards = np.append(rewards, [self._last_reward], axis=0) + next_states = np.append(next_states, [self._last_next_state], axis=0) + terminals = np.append(terminals, [self._last_terminal], axis=0) + return states, actions, rewards, next_states, terminals \ No newline at end of file diff --git a/src/test/resources/target_code/td3/reinforcement_learning/strategy.py b/src/test/resources/target_code/td3/reinforcement_learning/strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..d2d2386658c56c5ebb6d654468669a2664622305 --- /dev/null +++ b/src/test/resources/target_code/td3/reinforcement_learning/strategy.py @@ -0,0 +1,218 @@ +import numpy as np + + +class StrategyBuilder(object): + def __init__(self): + pass + + def build_by_params( + self, + method='epsgreedy', + epsilon=0.5, + min_epsilon=0.05, + epsilon_decay_method='no', + epsilon_decay=0.0, + epsilon_decay_start=0, + epsilon_decay_per_step=False, + action_dim=None, + action_low=None, + action_high=None, + mu=0.0, + theta=0.5, + sigma=0.3, + noise_variance=0.1 + ): + + if epsilon_decay_method == 'linear': + decay = LinearDecay( + eps_decay=epsilon_decay, min_eps=min_epsilon, + decay_start=epsilon_decay_start, + decay_per_step=epsilon_decay_per_step) + else: + decay = NoDecay() + + if method == 'epsgreedy': + assert action_dim is not None + assert len(action_dim) == 1 + return EpsilonGreedyStrategy( + eps=epsilon, number_of_actions=action_dim[0], + decay_method=decay) + elif method == 'ornstein_uhlenbeck': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert mu is not None + assert theta is not None + assert sigma is not None + return OrnsteinUhlenbeckStrategy( + action_dim, action_low, action_high, epsilon, mu, theta, + sigma, decay) + elif method == 'gaussian': + assert action_dim is not None + assert action_low is not None + assert action_high is not None + assert noise_variance is not None + return GaussianNoiseStrategy(action_dim, action_low, action_high, + epsilon, noise_variance, decay) + else: + assert action_dim is not None + assert len(action_dim) == 1 + return GreedyStrategy() + + +class BaseDecay(object): + def __init__(self): + pass + + def decay(self, *args): + raise NotImplementedError + + def __call__(self, *args): + return self.decay(*args) + + +class NoDecay(BaseDecay): + def __init__(self): + super(NoDecay, self).__init__() + + def decay(self, cur_eps, episode): + return cur_eps + + +class LinearDecay(BaseDecay): + def __init__(self, eps_decay, min_eps=0, decay_start=0, decay_per_step=False): + super(LinearDecay, self).__init__() + self.eps_decay = eps_decay + self.min_eps = min_eps + self.decay_start = 
decay_start + self.decay_per_step = decay_per_step + self.last_episode = -1 + + def do_decay(self, episode): + if self.decay_per_step: + do = (episode >= self.decay_start) + else: + do = ((self.last_episode != episode) and (episode >= self.decay_start)) + self.last_episode = episode + return do + + def decay(self, cur_eps, episode): + if self.do_decay(episode): + return max(cur_eps - self.eps_decay, self.min_eps) + else: + return cur_eps + + +class BaseStrategy(object): + def __init__(self, decay_method): + self._decay_method = decay_method + + def select_action(self, values, decay_method): + raise NotImplementedError + + def decay(self, episode): + self.cur_eps = self._decay_method.decay(self.cur_eps, episode) + + def reset(self): + pass + + +class EpsilonGreedyStrategy(BaseStrategy): + def __init__(self, eps, number_of_actions, decay_method): + super(EpsilonGreedyStrategy, self).__init__(decay_method) + self.eps = eps + self.cur_eps = eps + self.__number_of_actions = number_of_actions + + def select_action(self, values): + do_exploration = (np.random.rand() < self.cur_eps) + if do_exploration: + action = np.random.randint(low=0, high=self.__number_of_actions) + else: + action = values.asnumpy().argmax() + return action + + +class GreedyStrategy(BaseStrategy): + def __init__(self): + super(GreedyStrategy, self).__init__(None) + + def select_action(self, values): + return values.asnumpy().argmax() + + def decay(self): + pass + + +class OrnsteinUhlenbeckStrategy(BaseStrategy): + """ + Ornstein-Uhlenbeck process: dxt = theta * (mu - xt) * dt + sigma * dWt + where Wt denotes the Wiener process. + """ + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + mu=0.0, + theta=.15, + sigma=.3, + decay=NoDecay() + ): + super(OrnsteinUhlenbeckStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._decay_method = decay + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._mu = np.array(mu) + self._theta = np.array(theta) + self._sigma = np.array(sigma) + + self.state = np.ones(self._action_dim) * self._mu + + def _evolve_state(self): + x = self.state + dx = self._theta * (self._mu - x)\ + + self._sigma * np.random.randn(len(x)) + self.state = x + dx + return self.state + + def reset(self): + self.state = np.ones(self._action_dim) * self._mu + + def select_action(self, values): + noise = self._evolve_state() + action = (1.0 - self.cur_eps) * values + (self.cur_eps * noise) + return np.clip(action, self._action_low, self._action_high) + + +class GaussianNoiseStrategy(BaseStrategy): + def __init__( + self, + action_dim, + action_low, + action_high, + eps, + noise_variance, + decay=NoDecay() + ): + super(GaussianNoiseStrategy, self).__init__(decay) + self.eps = eps + self.cur_eps = eps + + self._action_dim = action_dim + self._action_low = action_low + self._action_high = action_high + + self._noise_variance = noise_variance + + def select_action(self, values): + noise = np.random.normal(loc=0.0, scale=self._noise_variance, size=self._action_dim) + action = values + self.cur_eps * noise + return np.clip(action, self._action_low, self._action_high) diff --git a/src/test/resources/target_code/td3/reinforcement_learning/util.py b/src/test/resources/target_code/td3/reinforcement_learning/util.py new file mode 100644 index 0000000000000000000000000000000000000000..1d8b38c4a32a1cc6c0e364054e746e9115d7d05b --- /dev/null +++ b/src/test/resources/target_code/td3/reinforcement_learning/util.py @@ -0,0 +1,300 @@ 
+import signal +import sys +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import style +import time +import os +import mxnet as mx +from mxnet import gluon, nd +import cnnarch_logger + +LOSS_FUNCTIONS = { + 'l1': gluon.loss.L1Loss(), + 'l2': gluon.loss.L2Loss(), + 'huber': gluon.loss.HuberLoss(), + 'softmax_cross_entropy': gluon.loss.SoftmaxCrossEntropyLoss(), + 'sigmoid_cross_entropy': gluon.loss.SigmoidBinaryCrossEntropyLoss()} + + +def make_directory_if_not_exist(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + +def copy_net(net, input_state_dim, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2(mx.nd.ones((1,) + input_state_dim, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + + return net2 + + +def copy_net_with_two_inputs(net, input_state_dim1, input_state_dim2, ctx): + assert isinstance(net, gluon.HybridBlock) + assert type(net.__class__) is type + + net2 = net.__class__() + net2.collect_params().initialize(mx.init.Zero(), ctx=ctx) + net2.hybridize() + net2( + nd.ones((1,) + input_state_dim1, ctx=ctx), + nd.ones((1,) + input_state_dim2, ctx=ctx)) + + params_of_net = [p.data() for _, p in net.collect_params().items()] + for i, (_, p) in enumerate(net2.collect_params().items()): + p.set_data(params_of_net[i]) + + return net2 + + +def get_loss_function(loss_function_name): + if loss_function_name not in LOSS_FUNCTIONS: + raise ValueError('Loss function does not exist') + return LOSS_FUNCTIONS[loss_function_name] + + +class AgentSignalHandler(object): + def __init__(self): + signal.signal(signal.SIGINT, self.interrupt_training) + self.__agent = None + self.__times_interrupted = 0 + + def register_agent(self, agent): + self.__agent = agent + + def interrupt_training(self, sig, frame): + self.__times_interrupted = self.__times_interrupted + 1 + if self.__times_interrupted <= 3: + if self.__agent: + self.__agent.set_interrupt_flag(True) + else: + print('Interrupt called three times: Force quit') + sys.exit(1) + +style.use('fivethirtyeight') + + +class TrainingStats(object): + def __init__(self, max_episodes): + self._logger = cnnarch_logger.ArchLogger.get_logger() + self._max_episodes = max_episodes + self._all_total_rewards = np.zeros((max_episodes,)) + self._all_eps = np.zeros((max_episodes,)) + self._all_time = np.zeros((max_episodes,)) + self._all_mean_reward_last_100_episodes = np.zeros((max_episodes,)) + + @property + def logger(self): + return self._logger + + @logger.setter + def logger(self, logger): + self._logger = logger + + @logger.deleter + def logger(self): + self._logger = None + + def add_total_reward(self, episode, total_reward): + self._all_total_rewards[episode] = total_reward + + def add_eps(self, episode, eps): + self._all_eps[episode] = eps + + def add_time(self, episode, time): + self._all_time[episode] = time + + def add_mean_reward_last_100(self, episode, mean_reward): + self._all_mean_reward_last_100_episodes[episode] = mean_reward + + def log_episode(self, *args): + raise NotImplementedError + + def mean_of_reward(self, cur_episode, last=100): + if cur_episode > 0: + reward_last_100 =\ + self._all_total_rewards[max(0, cur_episode-last):cur_episode] + return np.mean(reward_last_100) + else: + return self._all_total_rewards[0] + + def 
save(self, path, episode=None): + if episode is None: + episode = self._max_episodes + np.save(os.path.join(path, 'total_rewards'), self._all_total_rewards[:episode]) + np.save(os.path.join(path, 'eps'), self._all_eps[:episode]) + np.save(os.path.join(path, 'time'), self._all_time[:episode]) + np.save( + os.path.join(path, 'mean_reward'), + self._all_mean_reward_last_100_episodes[:episode]) + + def _log_episode(self, episode, start_time, training_steps, eps, reward): + self.add_eps(episode, eps) + self.add_total_reward(episode, reward) + end = time.time() + mean_reward_last_100 = self.mean_of_reward(episode, last=100) + time_elapsed = end - start_time + self.add_time(episode, time_elapsed) + self.add_mean_reward_last_100(episode, mean_reward_last_100) + return ('Episode: %d, Total Reward: %.3f, ' + 'Avg. Reward Last 100 Episodes: %.3f, {}, ' + 'Time: %.3f, Training Steps: %d, Eps: %.3f') % ( + episode, reward, mean_reward_last_100, time_elapsed, + training_steps, eps), mean_reward_last_100 + + +class DqnTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DqnTrainingStats, self).__init__(max_episodes) + self._all_avg_loss = np.zeros((max_episodes,)) + + def add_avg_loss(self, episode, avg_loss): + self._all_avg_loss[episode] = avg_loss + + def log_episode( + self, episode, start_time, training_steps, avg_loss, eps, reward + ): + self.add_avg_loss(episode, avg_loss) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(('Avg. Loss: %.3f') % (avg_loss)) + + self._logger.info(info) + return avg_reward + + def save_stats(self, path, episode=None): + if episode is None: + episode = self._max_episodes + + all_total_rewards = self._all_total_rewards[:episode] + all_avg_loss = self._all_avg_loss[:episode] + all_eps = self._all_eps[:episode] + all_mean_reward_last_100_episodes = self._all_mean_reward_last_100_episodes[:episode] + + fig = plt.figure(figsize=(20, 20)) + + sub_rewards = fig.add_subplot(221) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(episode), all_total_rewards) + + sub_loss = fig.add_subplot(222) + sub_loss.set_title('Avg. Loss per episode') + sub_loss.plot(np.arange(episode), all_avg_loss) + + sub_eps = fig.add_subplot(223) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(episode), all_eps) + + sub_rewards = fig.add_subplot(224) + sub_rewards.set_title('Avg. 
mean reward of last 100 episodes') + sub_rewards.plot(np.arange(episode), + all_mean_reward_last_100_episodes) + + self.save(path, episode=episode) + plt.savefig(os.path.join(path, 'stats.pdf')) + + def save(self, path, episode=None): + if episode is None: + episode = self._max_episodes + super(DqnTrainingStats, self).save(path, episode=episode) + np.save(os.path.join(path, 'avg_loss'), self._all_avg_loss[:episode]) + + +class DdpgTrainingStats(TrainingStats): + def __init__(self, max_episodes): + super(DdpgTrainingStats, self).__init__(max_episodes) + self._all_avg_critic_loss = np.zeros((max_episodes,)) + self._all_avg_actor_loss = np.zeros((max_episodes,)) + self._all_avg_qvalues = np.zeros((max_episodes,)) + + def add_avg_critic_loss(self, episode, avg_critic_loss): + self._all_avg_critic_loss[episode] = avg_critic_loss + + def add_avg_actor_loss(self, episode, avg_actor_loss): + self._all_avg_actor_loss[episode] = avg_actor_loss + + def add_avg_qvalues(self, episode, avg_qvalues): + self._all_avg_qvalues[episode] = avg_qvalues + + def log_episode( + self, episode, start_time, training_steps, actor_loss, + critic_loss, qvalues, eps, reward + ): + self.add_avg_actor_loss(episode, actor_loss) + self.add_avg_critic_loss(episode, critic_loss) + self.add_avg_qvalues(episode, qvalues) + + info, avg_reward = self._log_episode( + episode, start_time, training_steps, eps, reward) + info = info.format(( + 'Avg. Actor Loss: %.3f ' + 'Avg. Critic Loss: %.3f ' + 'Avg. Q-Values: %.3f') % (actor_loss, critic_loss, qvalues)) + + self.logger.info(info) + return avg_reward + + def save(self, path, episode=None): + if episode is None: + episode = self._max_episodes + super(DdpgTrainingStats, self).save(path, episode=episode) + np.save(os.path.join( + path, 'avg_critic_loss'), self._all_avg_critic_loss[:episode]) + np.save(os.path.join(path, 'avg_actor_loss'), self._all_avg_actor_loss[:episode]) + np.save(os.path.join(path, 'avg_qvalues'), self._all_avg_qvalues[:episode]) + + def save_stats(self, path, episode=None): + if episode is None: + episode = self._max_episodes + + all_total_rewards = self._all_total_rewards[:episode] + all_avg_actor_loss = self._all_avg_actor_loss[:episode] + all_avg_critic_loss = self._all_avg_critic_loss[:episode] + all_avg_qvalues = self._all_avg_qvalues[:episode] + all_eps = self._all_eps[:episode] + all_mean_reward_last_100_episodes = self._all_mean_reward_last_100_episodes[:episode] + + fig = plt.figure(figsize=(120, 120)) + + sub_rewards = fig.add_subplot(321) + sub_rewards.set_title('Total Rewards per episode') + sub_rewards.plot( + np.arange(episode), all_total_rewards) + + sub_actor_loss = fig.add_subplot(322) + sub_actor_loss.set_title('Avg. Actor Loss per episode') + sub_actor_loss.plot( + np.arange(episode), all_avg_actor_loss) + + sub_critic_loss = fig.add_subplot(323) + sub_critic_loss.set_title('Avg. Critic Loss per episode') + sub_critic_loss.plot( + np.arange(episode), all_avg_critic_loss) + + sub_qvalues = fig.add_subplot(324) + sub_qvalues.set_title('Avg. QValues per episode') + sub_qvalues.plot( + np.arange(episode), all_avg_qvalues) + + sub_eps = fig.add_subplot(325) + sub_eps.set_title('Epsilon per episode') + sub_eps.plot(np.arange(episode), all_eps) + + sub_rewards = fig.add_subplot(326) + sub_rewards.set_title('Avg. 
mean reward of last 100 episodes') + sub_rewards.plot(np.arange(episode), + all_mean_reward_last_100_episodes) + + self.save(path, episode=episode) + plt.savefig(os.path.join(path, 'stats.pdf')) diff --git a/src/test/resources/target_code/td3/start_training.sh b/src/test/resources/target_code/td3/start_training.sh new file mode 100644 index 0000000000000000000000000000000000000000..fed25e7f53adc0da813c289f7e3e2060dbc569ce --- /dev/null +++ b/src/test/resources/target_code/td3/start_training.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python CNNTrainer_tD3Config.py \ No newline at end of file diff --git a/src/test/resources/valid_tests/ddpg-ros/RosActorNetwork.cnnt b/src/test/resources/valid_tests/ddpg-ros/RosActorNetwork.cnnt index 074f312740b8b165edfd29f1afc18508fce84c9f..371b5cb96f507954b0edaf6c620dec0e185e9dbc 100644 --- a/src/test/resources/valid_tests/ddpg-ros/RosActorNetwork.cnnt +++ b/src/test/resources/valid_tests/ddpg-ros/RosActorNetwork.cnnt @@ -9,10 +9,9 @@ configuration RosActorNetwork { state_topic : "/environment/state" action_topic : "/environment/action" reset_topic : "/environment/reset" + reward_topic: "/environment/reward" } - reward_function : reward.rewardFunction - agent_name : "ddpg-agent" num_episodes : 2500 diff --git a/src/test/resources/valid_tests/ddpg-ros/comp/RosCriticNetwork.cnna b/src/test/resources/valid_tests/ddpg-ros/comp/RosCriticNetwork.cnna index 8c9363e72ccfb9d36457de656b32e2c31beb120f..a17088e94003c69909d4ed8953c113979eb2f88f 100644 --- a/src/test/resources/valid_tests/ddpg-ros/comp/RosCriticNetwork.cnna +++ b/src/test/resources/valid_tests/ddpg-ros/comp/RosCriticNetwork.cnna @@ -1,4 +1,8 @@ -implementation Critic(state, action) { +architecture RosCriticNetwork { + def input Q(-oo:oo)^{8} state + def input Q(-1:1)^{3} action + def output Q(-oo:oo)^{1} qvalues + (state -> FullyConnected(units=300) -> Relu() -> @@ -10,5 +14,7 @@ implementation Critic(state, action) { ) -> Add() -> FullyConnected(units=600) -> - Relu() + Relu() -> + FullyConnected(units=1) -> + qvalues; } \ No newline at end of file diff --git a/src/test/resources/valid_tests/ddpg-ros/reward/RewardFunction.emadl b/src/test/resources/valid_tests/ddpg-ros/reward/RewardFunction.emadl deleted file mode 100644 index 0e9b97d92e6995082a995a953b1a87fbb4df3148..0000000000000000000000000000000000000000 --- a/src/test/resources/valid_tests/ddpg-ros/reward/RewardFunction.emadl +++ /dev/null @@ -1,15 +0,0 @@ -package reward; - -component RewardFunction { - ports - in Q^{16} state, - in B isTerminal, - out Q reward; - - implementation Math { - Q speed = state(15); - Q angle = state(1); - - reward = speed * cos(angle); - } -} \ No newline at end of file diff --git a/src/test/resources/valid_tests/ddpg/comp/CriticNetwork.cnna b/src/test/resources/valid_tests/ddpg/comp/CriticNetwork.cnna index 8c9363e72ccfb9d36457de656b32e2c31beb120f..17a50cf1f5724511b804a647aa9b59937b3e3aea 100644 --- a/src/test/resources/valid_tests/ddpg/comp/CriticNetwork.cnna +++ b/src/test/resources/valid_tests/ddpg/comp/CriticNetwork.cnna @@ -1,4 +1,8 @@ -implementation Critic(state, action) { +architecture CriticNetwork { + def input Q(-oo:oo)^{8} state + def input Q(-1:1)^{3} action + def output Q(-oo:oo)^{1} qvalues + (state -> FullyConnected(units=300) -> Relu() -> @@ -10,5 +14,7 @@ implementation Critic(state, action) { ) -> Add() -> FullyConnected(units=600) -> - Relu() + Relu() -> + FullyConnected(units=1) -> + qvalues; } \ No newline at end of file diff --git a/src/test/resources/valid_tests/td3/TD3Config.cnnt 
b/src/test/resources/valid_tests/td3/TD3Config.cnnt new file mode 100644 index 0000000000000000000000000000000000000000..c3a21340f09064899348de65b0a72d4e17939d73 --- /dev/null +++ b/src/test/resources/valid_tests/td3/TD3Config.cnnt @@ -0,0 +1,32 @@
+configuration TD3Config {
+ learning_method : reinforcement
+ rl_algorithm : td3-algorithm
+ critic : comp.criticNetwork
+ environment : gym { name:"CartPole-v1" }
+ soft_target_update_rate: 0.001
+ policy_noise: 0.1
+ noise_clip: 0.8
+ policy_delay: 4
+
+ actor_optimizer : adam{
+ learning_rate : 0.0001
+ learning_rate_minimum : 0.00005
+ learning_rate_decay : 0.9
+ learning_rate_policy : step
+ }
+ critic_optimizer : rmsprop{
+ learning_rate : 0.001
+ learning_rate_minimum : 0.0001
+ learning_rate_decay : 0.5
+ learning_rate_policy : step
+ }
+ strategy : gaussian {
+ epsilon: 1.0
+ min_epsilon: 0.001
+ noise_variance: 0.3
+ epsilon_decay_per_step: true
+ epsilon_decay_method: linear
+ epsilon_decay : 0.0001
+ epsilon_decay_start: 50
+ }
+}
\ No newline at end of file
diff --git a/src/test/resources/valid_tests/td3/comp/CriticNetwork.cnna b/src/test/resources/valid_tests/td3/comp/CriticNetwork.cnna new file mode 100644 index 0000000000000000000000000000000000000000..17a50cf1f5724511b804a647aa9b59937b3e3aea --- /dev/null +++ b/src/test/resources/valid_tests/td3/comp/CriticNetwork.cnna @@ -0,0 +1,20 @@
+architecture CriticNetwork {
+ def input Q(-oo:oo)^{8} state
+ def input Q(-1:1)^{3} action
+ def output Q(-oo:oo)^{1} qvalues
+
+ (state ->
+ FullyConnected(units=300) ->
+ Relu() ->
+ FullyConnected(units=600)
+ |
+
+ action ->
+ FullyConnected(units=600)
+ ) ->
+ Add() ->
+ FullyConnected(units=600) ->
+ Relu() ->
+ FullyConnected(units=1) ->
+ qvalues;
+}
\ No newline at end of file
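
Note on replay_memory.py added above: the builder supports three memory variants: 'online' (size 1), 'buffer' (uniform sampling from a ring buffer), and 'combined', where the most recently appended transition is always attached to every sampled batch. The following is a minimal standalone sketch of that 'combined' sampling rule, using illustrative names rather than the generated classes (it is not the generated code itself):

import numpy as np


class CombinedMemorySketch(object):
    """Illustrative only: ring buffer whose sample() always contains the
    most recently appended transition (the 'combined' memory method)."""

    def __init__(self, size, state_dim):
        self.size = size
        self.states = np.zeros((size,) + state_dim, dtype='float32')
        self.actions = np.zeros(size, dtype='int64')
        self.rewards = np.zeros(size, dtype='float32')
        self.next_states = np.zeros((size,) + state_dim, dtype='float32')
        self.terminals = np.zeros(size, dtype='bool')
        self.cur_size = 0
        self.pointer = 0
        self.last = None

    def append(self, state, action, reward, next_state, terminal):
        i = self.pointer
        self.states[i] = state
        self.actions[i] = action
        self.rewards[i] = reward
        self.next_states[i] = next_state
        self.terminals[i] = terminal
        self.pointer = (self.pointer + 1) % self.size   # overwrite oldest entries
        self.cur_size = min(self.size, self.cur_size + 1)
        self.last = (state, action, reward, next_state, terminal)

    def sample(self, batch_size):
        # batch_size - 1 uniform draws plus the newest transition
        idx = np.random.randint(0, self.cur_size, size=batch_size - 1)
        batch = (self.states[idx], self.actions[idx], self.rewards[idx],
                 self.next_states[idx], self.terminals[idx])
        return tuple(np.append(b, [part], axis=0)
                     for b, part in zip(batch, self.last))


mem = CombinedMemorySketch(size=100, state_dim=(4,))
for step in range(10):
    mem.append(np.random.rand(4), step % 2, 1.0, np.random.rand(4), False)
states, actions, rewards, next_states, terminals = mem.sample(batch_size=4)
print(states.shape, actions.shape)   # (4, 4) and (4,)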
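
Note on strategy.py: the Ornstein-Uhlenbeck strategy used for DDPG-style exploration evolves its noise state according to dx_t = theta * (mu - x_t) * dt + sigma * dW_t with dt = 1, as stated in its docstring. The short sketch below reproduces only that state evolution in isolation; the function name and the stand-alone trajectory loop are illustrative, not part of the generated file:

import numpy as np


def ou_trajectory(action_dim, mu=0.0, theta=0.15, sigma=0.3, steps=1000):
    """Evolve x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1), i.e. the
    discrete OU update used by the generated strategy (dt = 1)."""
    x = np.ones(action_dim) * mu
    trajectory = np.empty((steps,) + tuple(action_dim))
    for t in range(steps):
        dx = theta * (mu - x) + sigma * np.random.randn(*action_dim)
        x = x + dx
        trajectory[t] = x
    return trajectory


noise = ou_trajectory(action_dim=(3,))
# Temporally correlated, zero-mean noise; long-run std is about
# sigma / sqrt(2 * theta - theta**2), roughly 0.57 for the defaults above.
print(noise.mean(axis=0), noise.std(axis=0))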
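
Note on the strategy : gaussian block in TD3Config.cnnt: it maps onto GaussianNoiseStrategy combined with a per-step LinearDecay. The actor's greedy action is perturbed with zero-mean Gaussian noise (the generated code passes noise_variance as the standard deviation argument of np.random.normal), scaled by the current epsilon, which shrinks linearly from 1.0 towards 0.001 once step 50 is reached. A rough standalone illustration with the config values hard-coded; the action bounds are assumptions made only for this example:

import numpy as np

# Values copied from the strategy block of TD3Config.cnnt; the action
# bounds below are assumptions for this example.
EPSILON, MIN_EPSILON = 1.0, 0.001
EPSILON_DECAY, DECAY_START = 0.0001, 50
NOISE_VARIANCE = 0.3
ACTION_LOW, ACTION_HIGH = -1.0, 1.0


def decayed_epsilon(step):
    """Closed form of repeatedly subtracting epsilon_decay once the
    per-step decay has started (epsilon_decay_per_step: true)."""
    if step < DECAY_START:
        return EPSILON
    return max(EPSILON - (step - DECAY_START) * EPSILON_DECAY, MIN_EPSILON)


def noisy_action(greedy_action, step):
    """action = mu(s) + eps_t * N(0, noise_variance), clipped to the bounds."""
    eps = decayed_epsilon(step)
    noise = np.random.normal(0.0, NOISE_VARIANCE, size=greedy_action.shape)
    return np.clip(greedy_action + eps * noise, ACTION_LOW, ACTION_HIGH)


actor_output = np.array([0.2, -0.5, 0.9])
print(noisy_action(actor_output, step=10))     # noise still at full scale
print(noisy_action(actor_output, step=5000))   # eps decayed to about 0.505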
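
Note on the remaining TD3-specific keys in TD3Config.cnnt (soft_target_update_rate, policy_noise, noise_clip, policy_delay): the trainer code that consumes them is not part of this diff, so the following is only a generic, hedged illustration of what a Polyak (soft) target update with tau = 0.001, applied every policy_delay-th step, does numerically; it is not code from this repository:

import numpy as np

TAU = 0.001        # soft_target_update_rate from TD3Config
POLICY_DELAY = 4   # actor and target networks updated every 4th critic step


def soft_update(target_params, online_params, tau=TAU):
    """Polyak averaging: target <- tau * online + (1 - tau) * target."""
    return [(1.0 - tau) * t + tau * o
            for t, o in zip(target_params, online_params)]


target = [np.zeros(3)]
online = [np.ones(3)]
for step in range(1, 9):
    if step % POLICY_DELAY == 0:   # delayed update, as configured
        target = soft_update(target, online)
print(target[0])   # slowly tracks the online parameters: ~0.002 after two updates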