Skip to content
Snippets Groups Projects
Commit f0416e16 authored by Rawel's avatar Rawel
Browse files

Added method to get all entries with heuristic mapping to associate a vcc_sha

parent 7e5810f6
No related branches found
No related tags found
No related merge requests found
......@@ -629,6 +629,27 @@ class DBRepository:
finally:
self.close()
def get_all_entries_with_heuristic_mapping(self):
    """
    Returns all rows of `link_fixing_commit_vcc` that have a non-empty
    heuristic mapping but no VCC assigned yet (`vcc_sha` is NULL).

    The connection is opened on demand and is always closed again, even
    when the query fails.

    :return: the matching rows as returned by `fetchall()`; the cursor is
        created with `dictionary=True`, so each row is keyed by column name
    """
    if not self.cnx.is_connected():
        self._open()
    try:
        cursor = self.cnx.cursor(dictionary=True)
        query = (
            "SELECT * FROM `link_fixing_commit_vcc` WHERE `vcc_sha` IS NULL "
            "AND JSON_LENGTH(`heuristic_mapping`) != 0;"
        )
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the connection on the error path as well, so a failing
        # query does not leave it open.
        self.close()
@staticmethod
def _save_references(cursor, cve: dict):
"""
......
import json
import os
from multiprocessing import Queue, Process
from Data.Database.db_repository import DBRepository
from Data.Heuristic.heuristic_parallel import calculate_chunk_size
from Data.Utils.CommitUtils import CommitUtils
from Data.Utils.utils import get_config_nodes_repo_dict
def worker(queue_in):
    """
    Process entry point: consume chunks from the queue until a `None`
    sentinel arrives.

    :param queue_in: queue yielding chunks of mappings, terminated by `None`
    """
    # Block on the queue; a None item is the shutdown signal.
    while (chunk := queue_in.get()) is not None:
        run_worker(chunk)
def run_worker(chunk: list[dict]):
    """
    Resolve the VCC for every mapping in the chunk and store it in the database.

    For each mapping the JSON-encoded heuristic mapping is decoded and its
    single most blamed commit is determined. Ambiguous mappings and initial
    commits are skipped. Any exception raised while handling a mapping is
    printed and the mapping is skipped, so one bad row does not abort the chunk.

    :param chunk: the mappings (database rows as dicts) to process
    """
    print(f"PID {os.getpid()} started processing {len(chunk)} mappings...")
    db_repo = DBRepository()

    for mapping in chunk:
        try:
            blame_counts = json.loads(mapping["heuristic_mapping"])
            vcc_sha = extract_most_blamed_commit(blame_counts)

            # No unique winner: nothing to store for this mapping.
            if not vcc_sha:
                print(f"Heuristic for mapping_id={mapping['mapping_id']} is ambiguous, skipping...")
                continue

            if vcc_sha in db_repo.initial_commits:
                print(f"PID {os.getpid()} ({mapping['mapping_id']}) Skipping initial commit...")
                continue

            config_code = mapping["vcc_config_code"]
            node = get_config_nodes_repo_dict()[config_code]
            utils = CommitUtils(True, node, config_code)
            # We do not need to fetch by remote since the heuristic only uses local commits
            utils.get_commit_by_sha(vcc_sha, False)

            db_repo.update_vcc_fixing_commit(
                mapping["mapping_id"],
                mapping["cve_id"],
                vcc_sha,
                config_code,
                mapping["confidence_value"],
                mapping["heuristic_mapping"],
            )
        except Exception as e:
            print(f"PID {os.getpid()} ({mapping['mapping_id']}) Error: {e} ... skipping")

    print(f"PID {os.getpid()} finished processing {len(chunk)} mappings.")
def extract_most_blamed_commit(heuristic_mapping):
    """
    Extracts the most blamed commit from the given heuristic mapping.
    If there are multiple commits with the same (highest) blame count, or the
    mapping is empty, returns None.

    :param heuristic_mapping: the heuristic mapping (commit -> blame count) to
        extract the most blamed commit from
    :return: the key with the unique highest blame count, or None
    """
    # max() raises ValueError on an empty sequence; an empty mapping has no
    # most blamed commit, so treat it like the ambiguous case.
    if not heuristic_mapping:
        return None

    most_blamed_count = max(heuristic_mapping.values())
    most_blamed_commits = [
        commit for commit, count in heuristic_mapping.items() if count == most_blamed_count
    ]
    return most_blamed_commits[0] if len(most_blamed_commits) == 1 else None
def main():
    """
    Loads all mappings that have a heuristic mapping but no VCC yet, splits
    them into chunks and processes the chunks in parallel worker processes.

    One worker process is started per CPU core. Each worker receives chunks
    through a shared queue and stops when it reads a `None` sentinel.
    """
    # os.cpu_count() returns None when the core count cannot be determined.
    num_cores = os.cpu_count() or 1
    db_repo = DBRepository()
    mappings = db_repo.get_all_entries_with_heuristic_mapping()

    print("Starting to process mappings...")
    print("Total number of mappings:", len(mappings))
    if not mappings:
        # Nothing to distribute; avoid spawning workers for no work.
        print("Finished processing mappings")
        return

    # Build the chunks before any process is started, so a failure here
    # cannot leave workers blocked on the queue. A chunk size below 1 would
    # make the range() step invalid.
    chunk_size = max(1, calculate_chunk_size(mappings, num_cores))
    chunks = [mappings[i : i + chunk_size] for i in range(0, len(mappings), chunk_size)]

    queue_in = Queue()
    workers = []
    for _ in range(num_cores):
        worker_process = Process(target=worker, args=(queue_in,))
        worker_process.start()
        workers.append(worker_process)

    for chunk in chunks:
        queue_in.put(chunk)
    # One sentinel per worker so that every process terminates.
    for _ in range(num_cores):
        queue_in.put(None)

    for worker_process in workers:
        worker_process.join()
    print("Finished processing mappings")
if __name__ == "__main__":
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment