Skip to content
Snippets Groups Projects
Commit 3b45ca32 authored by Rawel's avatar Rawel
Browse files

added some methods for loading and training

parent cbd751bb
No related branches found
No related tags found
No related merge requests found
......@@ -2,14 +2,19 @@ import os
from glob import glob
from multiprocessing import Pool
from joblib import load, dump
from scipy.sparse import load_npz
from tqdm import tqdm
from Classifier.BagOfWords import BagOfWords
from Classifier.commit_features import CommitFeatures
from Classifier.svm import Svm
from Data.Database.db_repository import DBRepository
from Data.Utils.utils import get_config_nodes_repo_dict
# Make this file's directory the working directory so that all relative
# paths used below ("Training/...", "Validation/...", "Vectors/...") resolve
# against the Classifier package location, not the caller's CWD.
abspath = os.path.dirname(os.path.abspath(__file__))
os.chdir(abspath)
def store_commit(mapping: dict):
print(f"PID {os.getpid()}: Storing VCC({mapping['vcc_sha']}) and Fixing({mapping['fixing_sha']})")
......@@ -25,7 +30,7 @@ def store_vcc(mapping: dict):
config_code = mapping["vcc_config_code"]
repo_path = get_config_nodes_repo_dict()[config_code].find("./path").text
directory = "Classifier/Training/vccs"
directory = "Training/vccs"
if mapping["determined_by_heuristic"] == 0:
directory = f"{directory}/{config_code.lower()}/ground_truth"
......@@ -116,7 +121,10 @@ def create_feature_vectors():
# # Save the validation set
def load_vcc_dataset():
def load_vcc_training_dataset(joblib_file=None):
if joblib_file is not None:
return load(joblib_file)
vcc_training = []
for feature_vector in glob("Training/vccs/*/ground_truth/*.npz"):
vcc_training.append([load_npz(feature_vector), 5])
......@@ -124,11 +132,70 @@ def load_vcc_dataset():
vcc_training.append([load_npz(feature_vector), 5])
for feature_vector in glob("Training/vccs/*/not_confident/*.npz"):
vcc_training.append([load_npz(feature_vector), 1])
print(f"Loaded {len(vcc_training)} VCCs")
os.makedirs("Vectors", exist_ok=True)
dump(vcc_training, "Vectors/vcc_training.joblib")
return vcc_training
def load_unclassified_training_dataset(joblib_file=None):
    """Load the unclassified training set as ``[feature_vector, commit_sha]`` pairs.

    Each ``.npz`` file under ``Training/unclassified/`` is loaded as a sparse
    matrix and paired with the commit sha taken from its file name.  The
    assembled list is also cached to ``Vectors/unclassified_training.joblib``.

    :param joblib_file: optional path to a previously dumped joblib cache;
        when given, it is loaded and returned directly instead of re-reading
        the individual ``.npz`` files.
    :return: list of ``[sparse_feature_vector, commit_sha]`` pairs.
    """
    if joblib_file is not None:
        return load(joblib_file)
    unclassified_training = []
    for feature_vector in glob("Training/unclassified/*/*.npz"):
        # Derive the sha from the file name rather than a fixed character
        # offset (the old [-44:-4] slice silently assumed a 40-char sha and
        # a ".npz" suffix at exact positions).
        sha = os.path.splitext(os.path.basename(feature_vector))[0]
        unclassified_training.append([load_npz(feature_vector), sha])
    print(f"Loaded {len(unclassified_training)} unclassified commits")
    # Cache the result so later runs can pass joblib_file and skip the globbing.
    os.makedirs("Vectors", exist_ok=True)
    dump(unclassified_training, "Vectors/unclassified_training.joblib")
    return unclassified_training
def load_vcc_validation_dataset(joblib_file=None):
    """Load the VCC validation set as ``[feature_vector, commit_sha]`` pairs.

    Each ``.npz`` file under ``Validation/vccs/`` is loaded as a sparse
    matrix and paired with the commit sha taken from its file name.  The
    assembled list is also cached to ``Vectors/vcc_validation.joblib``.

    :param joblib_file: optional path to a previously dumped joblib cache;
        when given, it is loaded and returned directly instead of re-reading
        the individual ``.npz`` files.
    :return: list of ``[sparse_feature_vector, commit_sha]`` pairs.
    """
    if joblib_file is not None:
        return load(joblib_file)
    vcc_validation = []
    for feature_vector in glob("Validation/vccs/*/*.npz"):
        # Derive the sha from the file name rather than a fixed character
        # offset (the old [-44:-4] slice silently assumed a 40-char sha and
        # a ".npz" suffix at exact positions).
        sha = os.path.splitext(os.path.basename(feature_vector))[0]
        vcc_validation.append([load_npz(feature_vector), sha])
    print(f"Loaded {len(vcc_validation)} VCCs")
    # Cache the result so later runs can pass joblib_file and skip the globbing.
    os.makedirs("Vectors", exist_ok=True)
    dump(vcc_validation, "Vectors/vcc_validation.joblib")
    return vcc_validation
def load_unclassified_validation_dataset(joblib_file=None):
    """Load the unclassified validation set as ``[feature_vector, commit_sha]`` pairs.

    Each ``.npz`` file under ``Validation/unclassified/`` is loaded as a
    sparse matrix and paired with the commit sha taken from its file name.
    The assembled list is also cached to
    ``Vectors/unclassified_validation.joblib``.

    :param joblib_file: optional path to a previously dumped joblib cache;
        when given, it is loaded and returned directly instead of re-reading
        the individual ``.npz`` files.
    :return: list of ``[sparse_feature_vector, commit_sha]`` pairs.
    """
    if joblib_file is not None:
        return load(joblib_file)
    unclassified_validation = []
    for feature_vector in glob("Validation/unclassified/*/*.npz"):
        # Derive the sha from the file name rather than a fixed character
        # offset (the old [-44:-4] slice silently assumed a 40-char sha and
        # a ".npz" suffix at exact positions).
        sha = os.path.splitext(os.path.basename(feature_vector))[0]
        unclassified_validation.append([load_npz(feature_vector), sha])
    print(f"Loaded {len(unclassified_validation)} unclassified commits")
    # Cache the result so later runs can pass joblib_file and skip the globbing.
    os.makedirs("Vectors", exist_ok=True)
    dump(unclassified_validation, "Vectors/unclassified_validation.joblib")
    return unclassified_validation
def train_model(c=0.09, w=0.2):
    """Train the SVM on the VCC and unclassified training sets and save it.

    The hyperparameters were previously hard-coded; they are now keyword
    parameters whose defaults preserve the old behavior, so existing callers
    (``train_model()``) are unaffected.

    :param c: SVM regularization parameter passed through to
        ``Svm.train_model`` (default 0.09, the previous hard-coded value).
    :param w: class weight passed through as ``weight`` (default 0.2, the
        previous hard-coded value).
    """
    svm = Svm()
    vcc_training = load_vcc_training_dataset()
    unclassified_training = load_unclassified_training_dataset()
    svm.train_model(vcc_training, unclassified_training, c=c, weight=w)
    # NOTE(review): the name format "c-{c}w{w}" (no separator before "w")
    # looks unintentional but is preserved for compatibility with any
    # previously saved models.
    svm.save_model(f"c-{c}w{w}")
def main():
store_commit_features()
BagOfWords.create_bag_of_words()
......@@ -136,7 +203,4 @@ def main():
# Script entry point.  Currently it only instantiates the DB repository and
# prints every stored commit mapping; the full pipeline lives in main().
if __name__ == "__main__":
    db_repo = DBRepository()
    print(db_repo.get_all_mappings())
    # main()
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment