# Blind extraction of equitable partitions from graph signals
The project code is available in this repository.
The extended paper is available here and on arXiv.
## Code Installation
Simply run `pip install -r code/requirements.txt` to install the required Python packages, or install them manually.
## Code Usage
To replicate the results of our paper, use `Experiments.ipynb`, which will guide you through the process that led to the figures displayed in the paper.
import numpy as np
import networkx as nx
import copy
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.cluster import KMeans

# Hash function used in the WL computation - essentially a lookup table
# backed by a dict that is indexed by arbitrary Python hashable values.
class HashFunction:
    def __init__(self):
        self.reset()

    # apply returns the hash for the given value; if the value is not in the
    # dict, a new hash is generated, stored in the dict and returned
    def apply(self, value):
        if value not in self.hash_dict:
            self.hash_dict[value] = self.hash_counter
            self.hash_counter += 1
        return self.hash_dict[value]

    # reset the hash lookup table
    def reset(self):
        self.hash_dict = {}
        self.hash_counter = 2

wl_hash = HashFunction()
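
# A minimal usage sketch (added for illustration, not part of the original
# pipeline): identical inputs map to the same integer, unseen inputs get
# fresh integers starting from 2.
def _demo_hash_function():
    h = HashFunction()
    assert h.apply(("a", (1, 2))) == 2  # first unseen value is assigned 2
    assert h.apply(("b", (1,))) == 3    # next unseen value is assigned 3
    assert h.apply(("a", (1, 2))) == 2  # repeated value returns the stored hash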
# Weisfeiler-Lehman algorithm
# Input:
#   graph1: the graph to color,
#   iterations: number of iterations (-1 means one iteration per node),
#   early_stopping: stop early in case a stable partition is reached before
#     the number of iterations is up,
#   hash: a HashFunction - in case you want to share it between multiple runs
#     of the WL algorithm
def weisfeiler_lehman(graph1: nx.Graph, iterations=-1, early_stopping=True, hash=wl_hash):
    if iterations == -1:
        iterations = len(graph1)
    Gamma1 = np.ones(len(graph1), dtype=int)
    set_colors_by_iteration = []
    colors_by_iteration = []
    # for the given number of iterations
    for t in range(iterations):
        # copying is necessary, as WL has a synchronous update
        tmp_Gamma1 = np.copy(Gamma1)
        # store the coloring so it can be returned with the output
        colors_by_iteration.append(copy.deepcopy(Gamma1))
        set_colors_by_iteration.append(set(Gamma1))
        # update the color of each node
        for node in range(len(graph1)):
            Gamma1[node] = hash.apply((Gamma1[node], tuple(sorted([tmp_Gamma1[n] for n in graph1[node]]))))
        # if the coloring is equivalent to the previous iteration, stop early
        if is_equivalent(Gamma1, tmp_Gamma1) and early_stopping:
            return tmp_Gamma1, t, set_colors_by_iteration, colors_by_iteration
    colors_by_iteration.append(copy.deepcopy(Gamma1))
    set_colors_by_iteration.append(set(Gamma1))
    return Gamma1, iterations, set_colors_by_iteration, colors_by_iteration
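
# A minimal usage sketch (hypothetical, wrapped in a function so importing
# this module has no side effects; weisfeiler_lehman depends on
# is_equivalent, which is defined further below):
def _demo_weisfeiler_lehman():
    G = nx.cycle_graph(6)  # vertex-transitive, so WL stabilises immediately
    colors, t, _, _ = weisfeiler_lehman(G)
    print(colors, "stable after", t, "iterations")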
# If the input is faulty - i.e. the constraints cannot be met - the behaviour
# of the function is undetermined.
# Input:
#   nodes_per_group: array with the size of each class,
#   links_to_other_group: matrix that gives the number of links between each
#     class (this is A^\pi),
#   multi_graph_okay: flag whether a non-simple graph may be returned -
#     default is False, meaning the returned graph is simple
def sample_csEP(nodes_per_group, links_to_other_group, multi_graph_okay=False):
    group_offset = []
    num_nodes = 0
    links_needed = np.zeros((np.sum(nodes_per_group), len(nodes_per_group)), dtype=int)
    # initialize links_needed - this is essentially AH and determines for each
    # node how many neighbours it has in each class
    for i in range(len(nodes_per_group)):
        for node in range(nodes_per_group[i]):
            links_needed[node + num_nodes, :] = links_to_other_group[i]
        group_offset.append(num_nodes)
        num_nodes += nodes_per_group[i]
    # initialize the adjacency matrix
    adjacency_matrix = np.zeros((np.sum(nodes_per_group), np.sum(nodes_per_group)), dtype=int)
    # add as many edges as necessary for each node and class
    for group in range(len(nodes_per_group)):
        for node in range(group_offset[group], group_offset[group] + nodes_per_group[group]):
            for other_group in range(len(links_to_other_group[group])):
                chosen_links = np.random.permutation([(node, i) for i in range(group_offset[other_group], group_offset[other_group] + nodes_per_group[other_group]) if links_needed[i, group] > 0 and node != i])
                for link in list(chosen_links[:links_needed[node, other_group]]):
                    adjacency_matrix[link[0], link[1]] = 1
                    adjacency_matrix[link[1], link[0]] = 1
                    links_needed[node, other_group] -= 1
                    links_needed[link[1], group] -= 1
                # For small graphs it is possible to have traversed all
                # candidates before all constraints are met. Thus the
                # procedure is repeated until the required number of links
                # is reached.
                while links_needed[node, other_group] > 0:
                    chosen_links = np.random.permutation([(node, i) for i in range(group_offset[other_group], group_offset[other_group] + nodes_per_group[other_group]) if links_needed[i, group] > 0])
                    link = chosen_links[0]
                    adjacency_matrix[link[0], link[1]] += 1
                    adjacency_matrix[link[1], link[0]] += 1 if link[0] != link[1] else 0
                    links_needed[node, other_group] -= 1
                    links_needed[link[1], group] -= 1 if node != link[1] else 0
    group_offset.append(np.sum(nodes_per_group))
    links_to_other_group_per_node = []
    for i, g in enumerate(nodes_per_group):
        links_to_other_group_per_node.extend([links_to_other_group[i]] * g)
    assert np.all([[np.sum([adjacency_matrix[i, j] for j in range(group_offset[group], group_offset[group + 1])]) for group in range(len(nodes_per_group))] for i in range(len(adjacency_matrix))] == links_to_other_group_per_node)
    # The adjacency matrix may describe a non-simple graph at this point;
    # unless a multigraph is acceptable, we try to make it simple.
    if not multi_graph_okay:
        adjacency_matrix = make_simple_graph(adjacency_matrix, nodes_per_group, links_to_other_group, group_offset)
        assert np.all([[np.sum([adjacency_matrix[i, j] for j in range(group_offset[group], group_offset[group + 1])]) for group in range(len(nodes_per_group))] for i in range(len(adjacency_matrix))] == links_to_other_group_per_node)
        assert len([(i, j) for i in range(len(adjacency_matrix)) for j in range(len(adjacency_matrix)) if adjacency_matrix[i, j] > 1 or (i == j and adjacency_matrix[i, j] > 0)]) == 0
    return adjacency_matrix
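
# A minimal usage sketch (hypothetical; the quotient matrix below is an
# illustrative choice, wrapped in a function since sample_csEP calls
# make_simple_graph, which is defined further below):
def _demo_sample_csEP():
    # two classes of 6 nodes; each node has 2 neighbours in its own class
    # and 1 in the other, i.e. A^\pi = [[2, 1], [1, 2]]
    A = sample_csEP([6, 6], [[2, 1], [1, 2]])
    print(A.sum(axis=1))  # every node has degree 3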
# function to rewire multi-edges and self-loops, as these are undesirable but
# do tend to occur in the sampling process.
# Input:
#   adjacency_matrix: preliminary output of sample_csEP (possibly non-simple
#     random graph with EP),
#   nodes_per_group: array with the size of each class,
#   links_to_other_group: matrix that gives the number of links between each
#     class (this is A^\pi),
#   group_offset: array with the first node of class i at the i-th index
def make_simple_graph(adjacency_matrix, nodes_per_group, links_to_other_group, group_offset):
    # first fix self-loops (rewiring them to edges within a class)
    diagonal_entries_to_fix = [(i, i) for i in range(len(adjacency_matrix)) if adjacency_matrix[i, i] > 0]
    for edge in diagonal_entries_to_fix:
        for i in range(adjacency_matrix[edge[0], edge[1]]):
            # if the number of self-loops is greater than or equal to 2, we
            # have more possibilities to swap edges
            if adjacency_matrix[edge[0], edge[1]] >= 2:
                group_x = [g for g in range(len(nodes_per_group)) if edge[0] >= group_offset[g] and edge[0] < group_offset[g + 1]][0]
                group_y = [g for g in range(len(nodes_per_group)) if edge[1] >= group_offset[g] and edge[1] < group_offset[g + 1]][0]
                candidates_x = [n for n in range(group_offset[group_x], group_offset[group_x + 1]) if n != edge[1] and adjacency_matrix[edge[1], n] < 1]
                candidates_y = [n for n in range(group_offset[group_y], group_offset[group_y + 1]) if n != edge[0] and adjacency_matrix[n, edge[0]] < 1]
                choice = np.random.permutation([(x, y) for x in candidates_x for y in candidates_y if adjacency_matrix[x, y] >= 1 and x != y])[0]
                adjacency_matrix[edge[0], choice[1]] = 1
                adjacency_matrix[choice[1], edge[0]] = 1
                adjacency_matrix[choice[0], edge[1]] = 1
                adjacency_matrix[edge[1], choice[0]] = 1
                adjacency_matrix[edge[0], edge[1]] -= 1
                adjacency_matrix[edge[1], edge[0]] -= 1
                adjacency_matrix[choice[0], choice[1]] -= 1
                adjacency_matrix[choice[1], choice[0]] -= 1
            # if there is only one self-loop, there must be some other node in
            # the class that also has one self-loop, and we connect the two
            elif adjacency_matrix[edge[0], edge[1]] == 1:
                group = [g for g in range(len(nodes_per_group)) if edge[0] >= group_offset[g] and edge[0] < group_offset[g + 1]][0]
                candidates = [n for n in range(group_offset[group], group_offset[group + 1]) if n != edge[0]]
                choice = np.random.permutation([(x, x) for x in candidates if adjacency_matrix[x, x] >= 1])[0]
                adjacency_matrix[edge[0], choice[1]] += 1
                adjacency_matrix[choice[1], edge[0]] += 1
                adjacency_matrix[edge[1], edge[0]] -= 1
                adjacency_matrix[choice[0], choice[1]] -= 1
            else:
                continue
    # fix multi-edges
    edges_to_fix = [(i, j) for i in range(len(adjacency_matrix)) for j in range(i + 1, len(adjacency_matrix)) if adjacency_matrix[i, j] > 1 and i != j]
    # similar procedure to above, but fixing edges that were chosen multiple times
    for edge in edges_to_fix:
        for i in range(adjacency_matrix[edge[0], edge[1]] - 1):
            group_x = [g for g in range(len(nodes_per_group)) if edge[0] >= group_offset[g] and edge[0] < group_offset[g + 1]][0]
            group_y = [g for g in range(len(nodes_per_group)) if edge[1] >= group_offset[g] and edge[1] < group_offset[g + 1]][0]
            candidates_x = [n for n in range(group_offset[group_x], group_offset[group_x + 1]) if n != edge[1] and adjacency_matrix[edge[1], n] < 1]
            candidates_y = [n for n in range(group_offset[group_y], group_offset[group_y + 1]) if n != edge[0] and adjacency_matrix[n, edge[0]] < 1]
            choice = np.random.permutation([(x, y) for x in candidates_x for y in candidates_y if adjacency_matrix[x, y] >= 1 and x != y])[0]
            adjacency_matrix[edge[0], choice[1]] = 1
            adjacency_matrix[choice[1], edge[0]] = 1
            adjacency_matrix[choice[0], edge[1]] = 1
            adjacency_matrix[edge[1], choice[0]] = 1
            adjacency_matrix[edge[0], edge[1]] -= 1
            adjacency_matrix[edge[1], edge[0]] -= 1
            adjacency_matrix[choice[0], choice[1]] -= 1
            adjacency_matrix[choice[1], choice[0]] -= 1
            # if the swapped edge was itself a multi-edge, restart the repair
            if adjacency_matrix[choice[0], choice[1]] > 0:
                return make_simple_graph(adjacency_matrix, nodes_per_group, links_to_other_group, group_offset)
    return adjacency_matrix
# blind WL algorithm
# Input:
#   Sigma: the approximate oracle,
#   max_num_components: maximum number of clusters the Gaussian mixtures can find,
#   n_init: number of initialisations for the Gaussian mixtures,
#   method: either "bgm" (BayesianGaussianMixture) or "gm" (GaussianMixture),
#   H: initial coloring (one color value per node); converted internally to a
#     partition indicator matrix,
#   verbose: 0 silent, 1 some intermediate output, 2 full output,
#   output_H: flag whether the output should be the partition indicator matrix
#     or an array with the colors
def blind_WL_from_outputs(Sigma, max_num_components=10, n_init=2, method="bgm", H=[], verbose=0, output_H=False):
    # Sigma is the approximate oracle.
    # We can incorporate previous knowledge of the classes by passing H;
    # if H is empty, the all-ones vector (a single class) is used.
    if len(H) == 0:
        H = np.ones((Sigma.shape[0], 1))
        dim_B = 1
    else:
        B = np.array([[1 if np.isclose(i, H[j], atol=0.00001) else 0 for j in range(len(H))] for i in np.unique(np.array(H).round(decimals=5))])
        H = B.transpose()
        dim_B = H.shape[1]
    # we normalize Sigma for better behaviour of the clustering algorithms
    A = Sigma / np.linalg.norm(Sigma, 2)
    if verbose == 2: print("A", A)
    # for a maximum of |V| iterations we compute the blind WL update
    for i in range(Sigma.shape[1]):
        if verbose == 2: print("H", H.shape, H)
        H = H @ np.diag([1 / sum(H[:, x]) for x in range(H.shape[1])]) * Sigma.shape[0]
        X = A @ H
        if verbose == 2: print("H", H.shape, H)
        if verbose == 1: print("X ", X.shape, X)
        # fit a BayesianGaussianMixture to X = AH
        if method == "bgm":
            BGM = BayesianGaussianMixture(n_components=max_num_components, mean_precision_prior=0.01, weight_concentration_prior_type="dirichlet_distribution", n_init=n_init, max_iter=1000, covariance_type='tied')
            colors = BGM.fit_predict(X)
            # if some component weight is very small, i.e. not really relevant, we fit a simpler model
            if min(BGM.weights_) < 1 / (2 * max_num_components):
                BGM = BayesianGaussianMixture(n_components=sum(BGM.weights_ >= 1 / (2 * max_num_components)), mean_precision_prior=0.01, weight_concentration_prior_type="dirichlet_distribution", n_init=n_init, max_iter=1000, covariance_type='tied')
                colors = BGM.fit_predict(X)
        # fit a GaussianMixture to X = AH
        elif method == "gm":
            BGM = GaussianMixture(n_components=max_num_components)
            colors = BGM.fit_predict(X)
            # if some component weight is very small, i.e. not really relevant, we fit a simpler model
            if min(BGM.weights_) < 1 / (2 * max_num_components):
                BGM = GaussianMixture(n_components=sum(BGM.weights_ >= 1 / (2 * max_num_components)))
                colors = BGM.fit_predict(X)
        # compute the new partition indicator matrix B
        if verbose == 1: print("colors ", colors)
        B = [[1 if i == colors[j] else 0 for j in range(len(A))] for i in np.unique(colors)]
        if verbose == 2: print("B ", np.array(B).transpose())
        # if the number of clusters has not changed from the previous update, we stop early
        if len(B) == dim_B:
            # output either the partition indicator matrix or an array assigning colors
            if output_H:
                return np.array(B).transpose()
            else:
                return colors
        dim_B = len(B)
        B = np.array(B).transpose()
        H = B
    # output either the partition indicator matrix or an array assigning colors
    # (B already holds the node-by-class indicator here)
    if output_H:
        return B
    else:
        return colors
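
# An end-to-end sketch (hypothetical: here the approximate oracle Sigma is
# taken to be the sample covariance of white noise passed through a
# polynomial graph filter; the filter below is an illustrative choice, not
# necessarily the one used in the paper):
def _demo_blind_wl():
    A = sample_csEP([6, 6], [[2, 1], [1, 2]])
    n = len(A)
    X = np.random.randn(n, 5000)            # white-noise inputs
    Y = (np.eye(n) + A + 0.5 * A @ A) @ X   # filtered graph signals
    Sigma = Y @ Y.T / X.shape[1]            # sample covariance
    print(blind_WL_from_outputs(Sigma, max_num_components=4))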
# Spectral clustering algorithm
# Input:
#   Sigma: sample covariance matrix,
#   max_num_components: maximum number of clusters the Gaussian mixtures can
#     find, or the number of clusters for KMeans,
#   n_init: number of initialisations for the Gaussian mixtures and KMeans,
#   num_top_vectors: number of top eigenvectors to consider in the clustering;
#     defaults to the number of clusters,
#   method: either "bgm" (BayesianGaussianMixture) or "kmeans" (KMeans)
def EP_from_outputs(Sigma, max_num_components=10, n_init=2, num_top_vectors=0, method="bgm"):
    if num_top_vectors == 0:
        num_top_vectors = max_num_components
    # get the eigenvectors
    U, E, V = np.linalg.svd(Sigma)
    U = U[:, :num_top_vectors]
    best_score = -np.inf
    best_predict = None
    for i in range(1, max_num_components):
        # fit the models to the first i eigenvectors and pick the best fit as the prediction
        if method == "bgm":
            BGM = BayesianGaussianMixture(n_components=i, mean_precision_prior=0.01, weight_concentration_prior_type="dirichlet_distribution", n_init=n_init)
            BGM_predict = BGM.fit_predict(U[:, :i])
            score = BGM.score(U[:, :i])
            if min(BGM.weights_) < 1 / max_num_components:
                return best_predict
            else:
                if score > best_score:
                    best_score = score
                    best_predict = BGM_predict
        elif method == "kmeans":
            KM = KMeans(n_clusters=max_num_components, n_init=n_init)
            best_predict = KM.fit_predict(U[:, :max_num_components])
    return best_predict
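
# A usage sketch (hypothetical, same illustrative signal model as above):
def _demo_EP_from_outputs():
    A = sample_csEP([6, 6], [[2, 1], [1, 2]])
    n = len(A)
    Y = (np.eye(n) + A) @ np.random.randn(n, 5000)  # filtered graph signals
    Sigma = Y @ Y.T / Y.shape[1]                    # sample covariance
    print(EP_from_outputs(Sigma, max_num_components=4, method="bgm"))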
# Subroutine of is_equivalent; checks whether coloring c1 refines coloring c2
# Input:
#   c1: first coloring,
#   c2: second coloring
def is_equiv_subroutine(c1, c2):
    color_map = {}
    # for each color of c1, store the color the corresponding nodes have in c2
    for i in range(len(c1)):
        if c1[i] not in color_map:
            color_map[c1[i]] = c2[i]
        # if the color has already been assigned, check that it maps to the same color
        else:
            # if not, c1 does not refine c2
            if color_map[c1[i]] != c2[i]:
                return False
    return True

# Checks whether coloring c1 is equivalent to coloring c2
# Input:
#   c1: first coloring,
#   c2: second coloring
def is_equivalent(c1, c2):
    return is_equiv_subroutine(c1, c2) and is_equiv_subroutine(c2, c1)
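
# A small illustration (added for clarity): two colorings are equivalent iff
# they induce the same partition, regardless of the label values used.
def _demo_is_equivalent():
    assert is_equivalent([0, 0, 1], [5, 5, 7])      # same partition, relabelled
    assert not is_equivalent([0, 0, 1], [5, 6, 7])  # second coloring is strictly finer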
numpy
networkx
scikit-learn
matplotlib