Source code for dynamix.generators.barabasi.tools

# -*- coding: utf-8 -*-
"""
Created on Mon June 02 18:44:40 2014

@author: Alex
"""
from __future__ import print_function, division, unicode_literals, absolute_import, generators

import itertools
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats  # used below as scipy.stats.spearmanr / scipy.stats.kendalltau
import random
from itertools import chain
from timeit import Timer
from scipy.optimize import curve_fit
from math import sqrt


def random_subset(seq, m):
    """ Return m unique elements from seq.

    Parameters
    ----------
    seq : list
        Node labels
    m : int
        Number of nodes to be returned

    Returns
    -------
    targets : set

    Notes
    -----
    This differs from random.sample, which can return repeated elements
    if seq holds repeated elements. Elements of the returned set come out
    in set-iteration order, not in random order; do not use it if a
    random order is desired.
    """
    targets = set()
    while len(targets) < m:
        x = random.choice(seq)
        targets.add(x)
    return targets
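
# Usage sketch (added for illustration, not part of the original module):
# drawing from a sequence with repeated labels makes frequent labels more
# likely per draw, while the result still holds each label at most once.
# The helper name _demo_random_subset is hypothetical.
def _demo_random_subset():
    seq = [0, 1, 2, 2, 2, 3]          # label 2 is three times as likely per draw
    picked = random_subset(seq, 2)
    assert len(picked) == 2 and picked <= set(seq)
    return picked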
def get_repeated_node_list(g):
    """ Return a list in which every node is repeated as many times as its degree.

    Parameters
    ----------
    g : Graph
        Graph from which the repeated node list is created

    Returns
    -------
    _ : list
    """
    return list(chain(*[[a] * b for (a, b) in g.degree().items()]))
    # tmp = [[x]*y for (x, y) in zip(g.degree().keys(), g.degree().values())]
    # repeated_nodes = [j for i in tmp for j in i]
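
# Usage sketch (added for illustration, not part of the original module),
# assuming the dict-returning g.degree() of NetworkX 1.x that this module uses.
def _demo_get_repeated_node_list():
    g = nx.path_graph(3)              # degrees: node 0 -> 1, node 1 -> 2, node 2 -> 1
    assert sorted(get_repeated_node_list(g)) == [0, 1, 1, 2]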
def barabasi_graph(nodes_prioritylist, m):
    """Return a random graph using the Barabási-Albert preferential attachment model.

    A graph of n nodes is grown by attaching new nodes, each with m edges
    that are preferentially attached to existing nodes with high degree.
    Nodes are added in the same order as in nodes_prioritylist.

    Parameters
    ----------
    nodes_prioritylist : list of node labels (string/int)
        Nodes to add, in priority order
    m : int
        Number of edges to attach from a new node to existing nodes

    Returns
    -------
    g : Graph

    Notes
    -----
    The initialization is a fully connected graph with m nodes.

    References
    ----------
    .. [1] A. L. Barabási and R. Albert "Emergence of scaling in
       random networks", Science 286, pp 509-512, 1999.
    """
    if m < 1 or m > len(nodes_prioritylist):
        raise nx.NetworkXError(
            u'Barabási-Albert network must have m>=1 and m<n, m={0:d},n={1:d}'.format(m, len(nodes_prioritylist)))

    # create empty start graph
    g = nx.empty_graph(0)
    g.name = u"custom_barabasi_albert_graph({0:d},{1:d})".format(len(nodes_prioritylist), m)

    # Add m initial nodes (m0 in barabasi-speak) and connect them completely
    g.add_nodes_from(nodes_prioritylist[:m])
    edges = itertools.combinations(nodes_prioritylist[:m], 2)
    g.add_edges_from(edges)

    # Target nodes for new edges: the (m+1)-th node connects to all m initial nodes
    targets = nodes_prioritylist[:m]
    # List of existing nodes, with nodes repeated once for each adjacent edge
    repeated_nodes = get_repeated_node_list(g)
    # Start adding the other n-m nodes. The first node is m.
    for n in nodes_prioritylist[m:]:
        # Add edges to m nodes from the source.
        g.add_edges_from(zip([n] * m, targets))
        # Add one node to the list for each new edge just created.
        repeated_nodes.extend(targets)
        # And the new node "source" has m edges to add to the list.
        repeated_nodes.extend([n] * m)
        # Now choose m unique nodes from the existing nodes
        # Pick uniformly from repeated_nodes (preferential attachment)
        targets = random_subset(repeated_nodes, m)
    return g
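
# Usage sketch (added for illustration, not part of the original module):
# building a graph from an explicit priority list. With the fully connected
# m-node seed, the edge count is (n - m) * m + m * (m - 1) / 2.
def _demo_barabasi_graph():
    g = barabasi_graph(list(range(50)), m=2)
    assert g.number_of_nodes() == 50
    assert g.number_of_edges() == (50 - 2) * 2 + 1
    return g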
def add_nodes_to_barabasi_graph(g, nodes_to_add, m):
    """Add nodes to an existing Barabási-Albert graph using the preferential attachment model.

    Graph g is grown by attaching new nodes, each with m edges that are
    preferentially attached to existing nodes with high degree.

    Parameters
    ----------
    g : Graph
        Graph where nodes will be added
    nodes_to_add : list of node labels
        Nodes which will be added to graph g
    m : int
        Number of edges per new node

    Returns
    -------
    g : Graph

    Notes
    -----
    Nodes will be added in order of the list "nodes_to_add".
    """
    if set(g.nodes()) & set(nodes_to_add):
        raise nx.NetworkXError(
            u'Some nodes of nodes_to_add already exist in the graph')
    if len(set(nodes_to_add)) != len(nodes_to_add):
        raise nx.NetworkXError(
            u'There are duplicate elements in nodes_to_add')

    # if the graph still has fewer than m nodes, connect new nodes to every existing node
    targets = g.nodes()
    while len(g.nodes()) < m:
        node = nodes_to_add.pop(0)
        g.add_node(node)  # make sure the node exists even if there are no targets yet
        g.add_edges_from(zip([node] * len(targets), targets))
        targets.append(node)
    # when the nodelist is empty return g
    if not nodes_to_add:
        return g

    repeated_nodes = get_repeated_node_list(g)
    targets = random_subset(repeated_nodes, m)
    for n in nodes_to_add:
        g.add_edges_from(zip([n] * m, targets))
        # Add one node to the list for each new edge just created.
        repeated_nodes.extend(targets)
        # And the new node "source" has m edges to add to the list.
        repeated_nodes.extend([n] * m)
        # Now choose m unique nodes from the existing nodes
        # Pick uniformly from repeated_nodes (preferential attachment)
        targets = random_subset(repeated_nodes, m)
    return g
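
# Usage sketch (added for illustration, not part of the original module):
# growing an existing preferential-attachment graph by ten more nodes keeps
# the Barabási-Albert edge count (n - m) * m + m * (m - 1) / 2.
def _demo_add_nodes_to_barabasi_graph():
    g = barabasi_graph(list(range(20)), m=2)
    add_nodes_to_barabasi_graph(g, list(range(20, 30)), m=2)
    assert g.number_of_nodes() == 30
    assert g.number_of_edges() == (30 - 2) * 2 + 1
    return g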
def repair_graph(g, m):
    """ Try to repair a given graph so that it is connected and follows the Barabási model.

    Three-step process:
    1) connect unconnected components
    2) add edges for nodes with a degree less than m
    3) add additional edges until g.edges = (g.nodes - m) * m + 0.5 * m * (m - 1)

    Parameters
    ----------
    g : Graph
        Graph which will be repaired
    m : int
        Minimal number of edges per node

    Notes
    -----
    The given graph will be fixed in place. No return value needed.
    """
    #print("\n---------------"
    #      "\nrepairing graph")
    # nothing to repair for graphs of size n <= m
    if len(g.nodes()) <= m:
        return nx.complete_graph(n=len(g.nodes()), create_using=g)

    maximum_nr_of_edges = max((len(g.nodes()) - m) * m, 0) + \
        min(0.5 * m * (m - 1), 0.5 * len(g.nodes()) * (len(g.nodes()) - 1))
    nr_of_available_edges = maximum_nr_of_edges - nx.number_of_edges(g)
    #print("nr_of_available_edges", nr_of_available_edges)

    # connect unconnected components randomly
    cc = list(nx.connected_component_subgraphs(g))
    while len(cc) > 1:
        if nr_of_available_edges == 0:
            return repair_graph(g, m + 1)
        #print("use edge to connect two components")
        components_to_connect = list(random_subset(cc, 2))
        # prefer nodes that do not yet have enough edges, because the number of edges to add can be very limited
        low_connected_nodes_1 = [x for (x, y) in components_to_connect[0].degree().items() if y < m]
        if low_connected_nodes_1:
            #print("choose low connected node")
            node_1 = random.choice(low_connected_nodes_1)
        else:
            #print("choose random node")
            if components_to_connect[0].number_of_nodes() == 1:
                node_1 = components_to_connect[0].nodes()[0]
            else:
                node_1 = random.choice(get_repeated_node_list(components_to_connect[0]))
        low_connected_nodes_2 = [x for (x, y) in components_to_connect[1].degree().items() if y < m]
        if low_connected_nodes_2:
            #print("choose low connected node")
            node_2 = random.choice(low_connected_nodes_2)
        else:
            #print("choose random node")
            if components_to_connect[1].number_of_nodes() == 1:
                node_2 = components_to_connect[1].nodes()[0]
            else:
                node_2 = random.choice(get_repeated_node_list(components_to_connect[1]))
        component = nx.union(components_to_connect[0], components_to_connect[1])
        cc.remove(components_to_connect[0])
        cc.remove(components_to_connect[1])
        cc.append(component)
        g.add_edge(node_1, node_2)
        component.add_edge(node_1, node_2)
        #print("connect components through:", node_1, " and ", node_2)
        nr_of_available_edges -= 1
    #cc = list(nx.connected_component_subgraphs(g))
    #print("nr_of_connected_components", len(cc))

    # find nodes with less than m edges and connect them
    nodes_in_need = {x: (m - y) for (x, y) in g.degree().items() if y < m}
    while nodes_in_need:
        if nr_of_available_edges == 0:
            return repair_graph(g, m + 1)
        #print("use edge to reach m edges")
        if len(nodes_in_need) > 1:
            # if two or more nodes need to be connected, choose from those
            edge_nodes = list(random_subset(list(nodes_in_need.keys()), 2))
            g.add_edge(edge_nodes[0], edge_nodes[1])
            nr_of_available_edges -= 1
            #print("improve degree values of:", edge_nodes)
        else:
            # if just one is left, connect it via preferential attachment
            edge_nodes = list(nodes_in_need.keys())
            # choose from the repeated node list, excluding the current element
            targets = list(chain(*[[a] * b for (a, b) in g.degree().items() if a != edge_nodes[0]]))
            target = random.choice(targets)
            # add edge between the low connected node and the preferential attachment node
            g.add_edge(edge_nodes[0], target)
            nr_of_available_edges -= 1
            #print("improve degree value of:", edge_nodes)
        # reduce node need and remove nodes that are no longer in need
        for e in edge_nodes:
            nodes_in_need[e] -= 1
            if nodes_in_need[e] == 0:
                del nodes_in_need[e]

    # todo: can be improved to get closer to the power law
    repeated_nodes = get_repeated_node_list(g)
    while nr_of_available_edges > 0:
        #print("use edge for preferential attachment")
        edge_nodes = list(random_subset(repeated_nodes, 2))
        if not g.has_edge(edge_nodes[0], edge_nodes[1]):
            g.add_edge(edge_nodes[0], edge_nodes[1])
            repeated_nodes.extend(edge_nodes)
            nr_of_available_edges -= 1
            #print("preferential attachment value of:", edge_nodes)
    #print("---------------")
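
# Usage sketch (added for illustration, not part of the original module):
# isolating one node and letting repair_graph reconnect the components in
# place. Only connectivity is asserted here; how far degrees are raised
# depends on the remaining edge budget.
def _demo_repair_graph():
    g = barabasi_graph(list(range(30)), m=2)
    g.remove_edges_from(list(g.edges(5)))     # isolate node 5
    repair_graph(g, 2)
    assert nx.is_connected(g)
    return g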
def sort_nodes_by_degree(g1, g2=None):
    """Return a nodelist sorted by node degree in decreasing order.

    Parameters
    ----------
    g1 : Graph
        Graph 1
    g2 : Graph, optional
        Graph 2

    Returns
    -------
    nodes : list
        Nodelist sorted by node degree in decreasing order
    """
    if g2 is None:
        tmp = list(g1.degree().items())
    else:
        tmp = list(chain(g1.degree().items(), g2.degree().items()))
    tmp.sort(key=lambda tup: tup[1], reverse=True)
    return [x[0] for x in tmp]
    # 2.4x speedup
    # keys = list(g1.degree().keys()) + list(g2.degree().keys())
    # values = list(g1.degree().values()) + list(g2.degree().values())
    # return [x for (y, x) in sorted(zip(values, keys), reverse=True)]
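
# Usage sketch (added for illustration, not part of the original module):
# the hub of a star graph comes first in the degree-sorted node list.
def _demo_sort_nodes_by_degree():
    g = nx.star_graph(3)              # node 0 has degree 3, the leaves have degree 1
    assert sort_nodes_by_degree(g)[0] == 0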
def compare_merge(graph, subgraph_1, subgraph_2):
    """Call the calculation of rank correlation coefficients and the edge similarity measure.

    Compares attributes of subgraph_1, subgraph_2 and their merged graph.

    Parameters
    ----------
    graph : Graph
        Merged graph
    subgraph_1 : Graph
        Subgraph 1 of graph
    subgraph_2 : Graph
        Subgraph 2 of graph

    Notes
    -----
    Requirement: graph.nodes() = subgraph_1.nodes() + subgraph_2.nodes()
    """
    # calculate rank correlation coefficients
    degree_rank_correlation(graph, subgraph_1, subgraph_2)
    #t = Timer(lambda: degree_rank_correlation(graph, subgraph_1, subgraph_2))
    #print(t.timeit(1), "degree_rank_correlation(graph, subgraph_1, subgraph_2)")

    # calculate edge similarity
    edge_similarity(graph, subgraph_1, subgraph_2)
    #t = Timer(lambda: edge_similarity(graph, subgraph_1, subgraph_2))
    #print(t.timeit(1), "edge_similarity(graph, subgraph_1, subgraph_2)")
    return 0
def degree_rank_correlation(graph, subgraph_1, subgraph_2):
    """Calculate rank correlation coefficients.

    Calculates Spearman's r (with tie correction) and Kendall's tau for the
    node degree distribution of subgraph_1, subgraph_2 and the merged graph.

    Parameters
    ----------
    graph : Graph
        Merged graph
    subgraph_1 : Graph
        Subgraph 1 of graph
    subgraph_2 : Graph
        Subgraph 2 of graph

    Notes
    -----
    Requirement: subgraph_1.nodes() + subgraph_2.nodes() = graph.nodes()
    """
    # node degree values of the subgraphs ordered by node name
    tmp = list(chain(subgraph_1.degree().items(), subgraph_2.degree().items()))
    tmp.sort(key=lambda tup: tup[0], reverse=True)
    subgraph_ranking = [x[1] for x in tmp]

    # node degree values of the graph ordered by node name
    tmp = list(graph.degree().items())
    tmp.sort(key=lambda tup: tup[0], reverse=True)
    graph_ranking = [x[1] for x in tmp]

    print("Rank correlation coefficients of node degree distribution")
    spearmanr = scipy.stats.spearmanr(subgraph_ranking, graph_ranking)
    #t = Timer(lambda: scipy.stats.spearmanr(subgraph_ranking, graph_ranking))
    #print(t.timeit(1), "spearmanr = scipy.stats.spearmanr(subgraph_ranking, graph_ranking)")
    print("spearmanr", spearmanr)
    kendallt = scipy.stats.kendalltau(subgraph_ranking, graph_ranking)
    #t = Timer(lambda: scipy.stats.kendalltau(subgraph_ranking, graph_ranking))
    #print(t.timeit(1), "kendallt = scipy.stats.kendalltau(subgraph_ranking, graph_ranking)")
    print("Kendall's tau", kendallt)
    return spearmanr, kendallt
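
# Usage sketch (added for illustration, not part of the original module):
# split a graph into two node-induced subgraphs that together cover all
# nodes, then compare the degree rankings of the parts with the whole.
# Assumes the NetworkX 1.x API used throughout this module.
def _demo_degree_rank_correlation():
    g = barabasi_graph(list(range(40)), m=2)
    sub1 = g.subgraph(range(0, 20))
    sub2 = g.subgraph(range(20, 40))
    return degree_rank_correlation(g, sub1, sub2)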
def edge_similarity(graph, subgraph_1, subgraph_2):
    """Calculate edge similarity measures for a graph and its subgraphs.

    Compares the number of edges represented in a graph and its subgraphs.
    Outputs the full similarity matrix for both subgraphs.

    Parameters
    ----------
    graph : Graph
        Initial graph
    subgraph_1 : Graph
        Subgraph 1 of graph
    subgraph_2 : Graph
        Subgraph 2 of graph

    Returns
    -------
    g1_edge_counter, g2_edge_counter : numpy.matrix
        Edge similarity matrices for subgraph_1 and subgraph_2

    Notes
    -----
    +--------+---------+
    |        |    G    |
    +========+====+====+
    |        | a  | b  |
    | **G'** +----+----+
    |        | c  | d  |
    +--------+----+----+

    +-----+---------------------------+--------+
    | val | Description               | index  |
    +=====+===========================+========+
    | a   | in G and G'               | [0,0]  |
    +-----+---------------------------+--------+
    | b   | only in G'                | [0,1]  |
    +-----+---------------------------+--------+
    | c   | only in G                 | [1,0]  |
    +-----+---------------------------+--------+
    | d   | not present in G and G'   | [1,1]  |
    +-----+---------------------------+--------+
    """
    # use set operations for improved speed
    e_subgraph_1 = {(a, b) if a < b else (b, a) for (a, b) in subgraph_1.edges()}
    n_subgraph_1 = set(subgraph_1.nodes())
    e_subgraph_2 = {(a, b) if a < b else (b, a) for (a, b) in subgraph_2.edges()}
    n_subgraph_2 = set(subgraph_2.nodes())
    e_graph = {(a, b) if a < b else (b, a) for (a, b) in graph.edges()}

    # similarity matrix for graph and subgraph_1
    g1_edge_counter = np.matrix([[0, 0], [0, 0]])
    # edges only in subgraph_1
    g1_edge_counter[1, 0] = len(e_subgraph_1 - e_graph)
    # edges in graph and subgraph_1
    g1_edge_counter[0, 0] = len(e_subgraph_1) - g1_edge_counter[1, 0]
    # all edges in graph only containing nodes of subgraph_1
    tmp_edges = {e for e in e_graph if e[0] in n_subgraph_1 and e[1] in n_subgraph_1}
    # edges in graph that would be possible in subgraph_1 but are not present
    g1_edge_counter[0, 1] = len(tmp_edges) - g1_edge_counter[0, 0]
    # all possible edges of subgraph_1
    possible_edges = len(n_subgraph_1) * (len(n_subgraph_1) - 1) / 2
    # possible edges of subgraph_1 that are present neither in graph nor in subgraph_1
    g1_edge_counter[1, 1] = possible_edges - g1_edge_counter[0, 0] - g1_edge_counter[1, 0] - g1_edge_counter[0, 1]

    # similarity matrix for graph and subgraph_2
    g2_edge_counter = np.matrix([[0, 0], [0, 0]])
    # edges only in subgraph_2
    g2_edge_counter[1, 0] = len(e_subgraph_2 - e_graph)
    # edges in graph and subgraph_2
    g2_edge_counter[0, 0] = len(e_subgraph_2) - g2_edge_counter[1, 0]
    # all edges in graph only containing nodes of subgraph_2
    tmp_edges = {e for e in e_graph if e[0] in n_subgraph_2 and e[1] in n_subgraph_2}
    # edges in graph that would be possible in subgraph_2 but are not present
    g2_edge_counter[0, 1] = len(tmp_edges) - g2_edge_counter[0, 0]
    # all possible edges of subgraph_2
    possible_edges = len(n_subgraph_2) * (len(n_subgraph_2) - 1) / 2
    # possible edges of subgraph_2 that are present neither in graph nor in subgraph_2
    g2_edge_counter[1, 1] = possible_edges - g2_edge_counter[0, 0] - g2_edge_counter[1, 0] - g2_edge_counter[0, 1]

    # output
    print("nr of edges g1: ", subgraph_1.number_of_edges())
    print("nr of edges g2: ", subgraph_2.number_of_edges())
    print("nr of edges merged_graph: ", graph.number_of_edges())
    print("edge counters g1:\n", g1_edge_counter)
    print("edge counters g2:\n", g2_edge_counter)
    #print("nr of edges merged graph: ", len(e_merged))
    #print("% of preserved edges at merge: ", similarity)
    return g1_edge_counter, g2_edge_counter
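
# Usage sketch (added for illustration, not part of the original module):
# a merged graph that keeps two of subgraph_1's three edges and both of
# subgraph_2's edges, plus one bridging edge between the parts.
def _demo_edge_similarity():
    sub1 = nx.Graph([(0, 1), (1, 2), (0, 2)])
    sub2 = nx.Graph([(3, 4), (4, 5)])
    merged = nx.Graph([(0, 1), (1, 2), (3, 4), (4, 5), (2, 3)])
    m1, m2 = edge_similarity(merged, sub1, sub2)
    assert m1[0, 0] == 2 and m1[1, 0] == 1   # shared vs. only-in-subgraph_1 edges
    assert m2[0, 0] == 2 and m2[1, 0] == 0
    return m1, m2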
def check_power_law(g):
    """Estimate a power-law fit and return the exponent and the RMSE.

    Uses the degree distribution to estimate the exponent a from P(k) ~ k ** a.

    Parameters
    ----------
    g : Graph
        Graph to analyse

    Returns
    -------
    exponent : double
        Exponent of the function fit.
    rmse : double
        Root mean squared error
    """
    cnt_nodes = nx.number_of_nodes(g)
    cnt_edges = nx.number_of_edges(g)
    degree_dist = {}
    min_degree = cnt_nodes
    max_degree = 1
    for n, d in g.degree().items():
        if d in degree_dist:
            degree_dist[d] += 1
        else:
            degree_dist[d] = 1
        min_degree = min(d, min_degree)
        max_degree = max(d, max_degree)

    obs = []
    est = []
    popt, pcov = curve_fit((lambda x, exp, norm: x**exp * norm),
                           list(degree_dist.keys()), list(degree_dist.values()))
    exponent = popt[0]
    est_dist = get_degree_distribution(cnt_edges, cnt_nodes, min_degree=min_degree, max_degree=max_degree)
    #print(popt, pcov)
    last = False
    for k in range(min_degree, max_degree + 1):
        #if obs and (est[-1] < 5 or obs[-1] < 5):
        #    #last = True
        ##if last:
        #    obs[-1] += degree_dist[k] if k in degree_dist else 0
        #    est[-1] += est_dist[k]
        #else:
        obs.append(degree_dist[k] if k in degree_dist else 0)
        est.append(est_dist[k])
        #print(len(obs), obs[-1], est[-1])
    #print(list(map(lambda x, y: abs(x - y), est, obs)))
    #est_sample = list(chain([[d]*cnt for (d, cnt) in est_dist.items()]))
    #obs_sample = list(chain([[d]*cnt for (d, cnt) in degree_dist.items()]))
    #print(ks_2samp(est_sample, obs_sample))
    rmse = sqrt(np.sum(np.square(np.array(obs) - np.array(est))) / len(obs))
    #print(rmse)
    #plt.plot(est)
    #plt.plot(obs)
    #plt.legend(["est", "obs"])
    #plt.show()
    return float(exponent), float(rmse)
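
# Usage sketch (added for illustration, not part of the original module):
# for a reasonably large Barabási-Albert graph the fitted exponent should
# come out negative, roughly in the vicinity of -3. No assertion is made
# because the curve_fit result varies with the random graph instance.
def _demo_check_power_law():
    g = barabasi_graph(list(range(500)), m=3)
    exponent, rmse = check_power_law(g)
    print("fitted exponent:", exponent, "rmse:", rmse)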
def estimate_m(g, g2=None, model_correction=True):
    """Estimate the parameter m for graphs following the Barabási-Albert model.

    Estimates the parameter m by counting edges and vertices of graph g.
    Additionally estimates a shared m if two graphs are used as input parameters.

    Parameters
    ----------
    g : Graph
        Graph for estimating m
    g2 : Graph, optional
        Second graph which will be included in the estimation for a shared m (default = None)
    model_correction : boolean, optional
        Whether a model correction was used for the creation of graphs g and g2 (default = True).
        True = Barabási-Albert model starting with a complete graph of m nodes
        False = Barabási-Albert model starting with an empty graph of m nodes

    Returns
    -------
    m : int

    Notes
    -----
    The estimate will be influenced by nodes not following the Barabási-Albert
    model, e.g. outliers connected to every node in the graph.
    """
    if g2 is None:
        # calculate m for a single graph g
        e = len(g.edges())
        n = len(g.nodes())
        if model_correction:
            discriminant_term = (n - 0.5) ** 2 - 2 * e
            if discriminant_term < 0:
                raise nx.NetworkXError(u"m couldn't be estimated, because of too much noise")
            m = n - 0.5 - np.sqrt(discriminant_term)
        else:
            discriminant_term = (n / 2) ** 2 - e
            if discriminant_term < 0:
                raise nx.NetworkXError(u"m couldn't be estimated, because of too much noise")
            m = n / 2 - np.sqrt(discriminant_term)
        return int(m)
    else:
        # calculate m for the combined graph of g and g2
        # trivial case: m1 and m2 are the same -> return m1
        m1 = estimate_m(g, model_correction=model_correction)
        m2 = estimate_m(g2, model_correction=model_correction)
        if m1 == m2:
            return int(m1)

        print("estimation of merged_m not proven yet")
        n = len(g.nodes()) + len(g2.nodes())
        e1 = len(g.edges())
        e2 = len(g2.edges())
        # number of edges if the nodes of g1 were added to g2
        e12 = (n - m2) * m2 + (m2 * (m2 - 1) / 2)
        # number of edges if the nodes of g2 were added to g1
        e21 = (n - m1) * m1 + (m1 * (m1 - 1) / 2)
        # e will satisfy min(e12, e21) <= e <= max(e12, e21)
        # and e1 + e2 <= e, because in a merge the first m-1 nodes of one graph will connect with m edges
        print("number of edges g1+g2", len(g.edges()) + len(g2.edges()))
        for m in range(min(m1, m2), max(m1, m2) + 1):
            if model_correction:
                e = (n - m) * m + (m * (m - 1) / 2)
            else:
                e = (n - m) * m
            print("m = ", m, "; e = ", e)
            #if e > len(g.edges()) + len(g2.edges()):
            #if e >= min(e12, e21) and e >= e1 + e2:
            if e >= e1 + e2:
                return int(m)
        raise nx.NetworkXError(u"m couldn't be estimated")
        # todo: maybe change to a weighted mean of m1 and m2
        # weighted mean of m1 and m2
        # return (len(g.nodes) * m1 + len(g2.nodes) * m2) // n
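
# Sketch of the closed form used above (added here, not original text): with
# the complete-graph seed, e = (n - m) * m + m * (m - 1) / 2, i.e.
# m**2 - 2 * m * (n - 0.5) + 2 * e = 0, whose smaller root is
# m = (n - 0.5) - sqrt((n - 0.5)**2 - 2 * e); without the seed correction,
# e = (n - m) * m gives m = n / 2 - sqrt((n / 2)**2 - e).
def _demo_estimate_m():
    g = barabasi_graph(list(range(200)), m=4)
    assert estimate_m(g, model_correction=True) == 4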
def get_degree_distribution(number_of_edges, number_of_nodes=None, exponent=-2.9, min_degree=1, max_degree=None):
    """ Estimate the degree distribution.

    Calculates the estimated number of nodes with a given node degree using
    P(k) ~ k ** exponent.

    Parameters
    ----------
    number_of_edges : int
        The number of estimated edges
    number_of_nodes : int, optional
        The number of nodes in the graph. If not given,
        number_of_nodes = number_of_edges + 1. It is only used to get the
        maximum node degree.
    exponent : float, optional
        The exponent used for the estimation. If not given, -2.9 is used,
        as given in Barabási's paper.
    min_degree : int, optional
        Smallest node degree included in the distribution (default = 1)
    max_degree : int, optional
        Largest node degree included in the distribution.
        If not given, number_of_nodes - 1 is used.

    Returns
    -------
    dict
        Estimated node degree distribution
    """
    if not number_of_nodes:
        number_of_nodes = number_of_edges + 1
    if not max_degree:
        max_degree = number_of_nodes - 1
    exp = {k: k**exponent for k in range(min_degree, max_degree + 1)}
    s = sum(exp.values())
    norm = number_of_nodes / s
    return {k: val * norm for (k, val) in exp.items()}
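
# Usage sketch (added for illustration, not part of the original module):
# the returned counts are normalised so that they sum to number_of_nodes.
def _demo_get_degree_distribution():
    dist = get_degree_distribution(number_of_edges=20, number_of_nodes=10)
    assert abs(sum(dist.values()) - 10) < 1e-9
    return dist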
if __name__ == "__main__":
    for degree, no in get_degree_distribution(20, 10).items():
        print(degree, no, sep="\t")

    """
    print("testcase estimate_m")
    print("test barabasi graphs starting with empty graph of m nodes")
    g1 = nx.barabasi_albert_graph(10, 2)
    print("estimated m of g1:", estimate_m(g1, model_correction=False))
    g2 = nx.barabasi_albert_graph(20, 5)
    print("estimated m of g2:", estimate_m(g2, model_correction=False))
    g3 = nx.barabasi_albert_graph(30, 10)
    print("estimated m of g3:", estimate_m(g3, model_correction=False))
    print("estimated m of g1 and g2:", estimate_m(g1, g2, model_correction=False))
    print("estimated m of g1 and g3:", estimate_m(g1, g3, model_correction=True))
    print("estimated m of g2 and g3:", estimate_m(g2, g3, model_correction=True))
    print("\n")

    print("test barabasi graphs starting with complete graph of m nodes")
    g1 = barabasi_graph(range(100), 2)
    print("estimated m of g1:", estimate_m(g1, model_correction=True))
    g2 = barabasi_graph(range(20), 5)
    print("estimated m of g2:", estimate_m(g2, model_correction=True))
    g3 = barabasi_graph(range(30), 10)
    print("estimated m of g3:", estimate_m(g3, model_correction=True))
    print("estimated m of g1 and g2:", estimate_m(g1, g2, model_correction=True))
    print("estimated m of g1 and g3:", estimate_m(g1, g3, model_correction=True))
    print("estimated m of g2 and g3:", estimate_m(g2, g3, model_correction=True))
    """

    """
    #testcase edge similarity
    #create subgraph_1
    subgraph_1 = nx.empty_graph(5)
    subgraph_1.add_edges_from(zip([0, 0, 0, 1, 1], [1, 2, 4, 2, 3]))
    plt.clf()
    plt.subplot(1, 3, 1)
    plt.title("Subgraph 1")
    nx.draw_circular(subgraph_1)

    #create subgraph_2
    subgraph_2 = nx.empty_graph(0)
    subgraph_2.add_nodes_from([5, 6, 7])
    subgraph_2.add_edges_from(zip([5, 5, 6], [6, 7, 7]))
    plt.subplot(1, 3, 2)
    plt.title("Subgraph 2")
    nx.draw_circular(subgraph_2)

    graph = subgraph_1.copy()
    #remove 2 edges of subgraph_1
    graph.remove_edge(0, 2)
    graph.remove_edge(1, 3)
    #add 1 edge not present in subgraph_1
    graph.add_edge(3, 4)
    #add edges of subgraph_2
    graph.add_nodes_from([5, 6, 7])
    graph.add_edges_from(zip([5, 5], [6, 7]))
    #add 2 intergraph edges
    graph.add_edges_from(zip([2, 2], [5, 6]))

    edge_similarity(graph, subgraph_1, subgraph_2)
    # expected output:
    #3 edges in graph and subgraph_1
    #2 edges only in subgraph_1
    #1 edge only in graph
    #4 edges missing in graph and subgraph_1
    #2 edges in graph and subgraph_2
    #1 edges only in subgraph_2
    #0 edge only in graph
    #0 edges missing in graph and subgraph_2

    plt.subplot(1, 3, 3)
    plt.title("Graph")
    nx.draw_circular(graph)
    plt.show()
    """