import sys, gzip, Queue

def int_of_ip(ip):
        r = 0
        for x in ip.split('.'):
                r *= 256
                r += int(x)
        return(r)

def read_tree(filename):
        f = gzip.open(filename)
        t = {}
        for l in f:
                [son,father] = l.split()
                t[son] = father
        return(t)

def read_ip_tree(filename):
        f = gzip.open(filename)
        t = {}
        for l in f:
                [son,father] = l.split()
		if son[0].isdigit() and father[0].isdigit():
                	t[int_of_ip(son)] = int_of_ip(father)
        return(t)

def connected_components(g):
	visited = {}
        for v in g.iterkeys():
		visited[v] = 0
	n_cc = 0
	q = Queue.Queue()
	for v in g.iterkeys():
		if visited[v] == 0:
			n_cc += 1
			q.put(v)
			visited[v] = n_cc
			while not q.empty():
				v = q.get()
                             	for u in g[v]:
					if visited[u] == 0:
						visited[u] = n_cc
						q.put(u)
	cc_size_nodes = [0 for i in xrange(n_cc)]
	for v in g.iterkeys():
		cc_size_nodes[visited[v]-1] += 1
	cc_size_links = [0 for i in xrange(n_cc)]
	for v in g.iterkeys():
		for u in g[v]:
			cc_size_links[visited[v]-1] += 1
	for i in xrange(n_cc):
		cc_size_links[i] /= 2
	if n_cc>0:
		size_largest_cc = max(cc_size_nodes)
	else:
		size_largest_cc = None
	return(n_cc,size_largest_cc,cc_size_nodes,cc_size_links)

def distance(g,x,y):
	res = {}
	if x not in g or y not in g:
		return(-2)
        for v in g.iterkeys():
		res[v] = -1
	q = Queue.Queue()
	q.put(x)
	res[x] = 0
	while not q.empty():
		v = q.get()
              	for u in g[v]:
			if res[u] == -1:
				res[u] = res[v] + 1
				if u == y:
					return(res[u])
				q.put(u)
	return(res[y])

def output_cc(f,i,(n_cc,size_largest_cc,cc_size_nodes,cc_size_links)):
	for x in xrange(n_cc):
		n = cc_size_nodes[x]
		m = cc_size_links[x]
		avg_deg = 0
		if n>0:
			avg_deg = (2.*m)/n
		density = 0
		if n>1:
			density = (2.*m)/(n*(n-1.))
		f.write("%d %d %d %d %d %f %f\n"%(i,n_cc,size_largest_cc,n,m,avg_deg,density))
	f.flush()

def make_subgraph(nodes,links):
	g = {}
	for x in nodes:
		g[x] = set()
	for x,y in links:
		if x in nodes and y in nodes:
			g[x].add(y)
			g[y].add(x)
	return(g)

def make_subgraph_d1(nodes,links):
	g = {}
	for x in nodes:
		g[x] = set()
	for x,y in links:
		if x in nodes or y in nodes:
			g[x] = set()
			g[y] = set()
	for x,y in links:
		if x in nodes or y in nodes:
			g[x].add(y)
			g[y].add(x)
	return(g)

if __name__ == '__main__':
	if len(sys.argv) != 5:
		sys.stderr.write("Usage: %s input_dir begin end out_dir\n"%sys.argv[0])
		sys.exit(1)
	input_dir = sys.argv[1]+"/"
	begin = int(sys.argv[2])
	end = int(sys.argv[3])
	out_dir = sys.argv[4]+'/'

	intervals = set()
	l = [1,2,3,5,10,50,100]
	for pre in l:
		for cur in l:
			intervals.add((pre,cur))
	intervals = set()
	intervals.add((5,1))
	intervals.add((10,1))
	intervals.add((50,1))
	intervals.add((5,2))
	intervals.add((10,2))
	intervals.add((50,2))
	intervals.add((5,5))
	intervals.add((10,5))
	intervals.add((50,5))

	pre_set = set([x for (x,y) in intervals])
	cur_set = set([y for (x,y) in intervals])
	max_pre = max(pre_set)
	max_cur = max(cur_set)

	sys.stderr.write("max_pre = %d  max_cur = %d\n"%(max_pre,max_cur))
	sys.stderr.flush()

	# we will keep max_pre+max_cur steps in memory
	nodes = [set() for i in xrange(max_pre+max_cur)]
	links = [set() for i in xrange(max_pre+max_cur)]
	directed_links = [set() for i in xrange(max_pre+max_cur)]
	all_nodes = set()
	all_links = set()

	# we will create two files for each (x,y) in intervals
	out_file = {}
	out_dist_file = {}
	out_cc_file_Up_Uc = {}
	out_cc_file_Uc_Up = {}
	out_cc_file_Ic_Up = {}
	out_cc_file_Ip_Uc = {}
	out_cc_file_Up_Uc_d1 = {}
	out_cc_file_Uc_Up_d1 = {}
	out_cc_file_Ic_Up_d1 = {}
	out_cc_file_Ip_Uc_d1 = {}
	for (pre,cur) in intervals:
			out = gzip.open(out_dir+"out_"+str(pre)+"_"+str(cur)+".gz","w")
			out_dist = gzip.open(out_dir+"out_dist_"+str(pre)+"_"+str(cur)+".gz","w")
			out_cc_Up_Uc = gzip.open(out_dir+"cc_Up_Uc_"+str(pre)+"_"+str(cur)+".gz","w")
			out_cc_Uc_Up = gzip.open(out_dir+"cc_Uc_Up_"+str(pre)+"_"+str(cur)+".gz","w")
			out_cc_Ic_Up = gzip.open(out_dir+"cc_Ic_Up_"+str(pre)+"_"+str(cur)+".gz","w")
			out_cc_Ip_Uc = gzip.open(out_dir+"cc_Ip_Uc_"+str(pre)+"_"+str(cur)+".gz","w")
			out_cc_Up_Uc_d1 = gzip.open(out_dir+"cc_Up_Uc_d1_"+str(pre)+"_"+str(cur)+".gz","w")
			out_cc_Uc_Up_d1 = gzip.open(out_dir+"cc_Uc_Up_d1_"+str(pre)+"_"+str(cur)+".gz","w")
			out_cc_Ic_Up_d1 = gzip.open(out_dir+"cc_Ic_Up_d1_"+str(pre)+"_"+str(cur)+".gz","w")
			out_cc_Ip_Uc_d1 = gzip.open(out_dir+"cc_Ip_Uc_d1_"+str(pre)+"_"+str(cur)+".gz","w")

			out.write("# 1: i = index of curent round\n")
			out.write("# 2: total nb nodes since beginning\n")
			out.write("# 3: total nb links since beginning\n")
			out.write("# 4: nb nodes in round i\n")
			out.write("# 5: nb links in round i\n")
			out.write("#\n")
			out.write("# 6: number of nodes in the union (Up) pre=%d previous rounds (from i-pre-1 to i-1)\n"%pre)
			out.write("# 7: ---------------------------- (Uc) cur=%d current  ------ (from i to i+cur-1)\n"%cur)
			out.write("# 8: ---------------------- intersection (Ip) pre=%d previous rounds (from i-pre-1 to i-1)\n"%pre)
			out.write("# 9: ----------------------------------- (Ic) cur=%d current  ------ (from i to i+cur-1)\n"%cur)
			out.write("#\n")
			out.write("#10: number of appearing nodes (ie in Uc and not in Up)\n")
			out.write("#11: number of disappearing nodes (ie in Up and not in Uc)\n")
			out.write("#12: number of appearing stable nodes (ie in Ic and not in Up)\n")
			out.write("#13: number of disappearing stable nodes (ie in Ip and not in Uc)\n")
			out.write("#\n")
			out.write("#14-21: same as 6-13 but with links\n")
			out.write("#\n")
			out.write("#22-: border stats\n")
			out.write("#\n")
			out.flush()

			out_dist.write("# each line: step number, number of appearing linkes, distances between extremities of one of them\n")
			out_dist.flush()

			for f in [out_cc_Up_Uc,out_cc_Uc_Up,out_cc_Ic_Up,out_cc_Ip_Uc,out_cc_Up_Uc_d1,out_cc_Uc_Up_d1,out_cc_Ic_Up_d1,out_cc_Ip_Uc_d1]:
				f.write("# 1: i = index of curent round\n")
				f.write("# 2: number of connected components\n")
				f.write("# 3: size of the largest one\n")
				f.write("# 4-...: size of each connected component\n")
				f.flush()

			out_file[pre,cur] = out
			out_dist_file[pre,cur] = out_dist
			out_cc_file_Up_Uc[pre,cur] = out_cc_Up_Uc
			out_cc_file_Uc_Up[pre,cur] = out_cc_Uc_Up
			out_cc_file_Ic_Up[pre,cur] = out_cc_Ic_Up
			out_cc_file_Ip_Uc[pre,cur] = out_cc_Ip_Uc
			out_cc_file_Up_Uc_d1[pre,cur] = out_cc_Up_Uc_d1
			out_cc_file_Uc_Up_d1[pre,cur] = out_cc_Uc_Up_d1
			out_cc_file_Ic_Up_d1[pre,cur] = out_cc_Ic_Up_d1
			out_cc_file_Ip_Uc_d1[pre,cur] = out_cc_Ip_Uc_d1

	step = begin
	i = 0
	while step < end:

		sys.stderr.write(str(step)+" ")
		sys.stderr.flush()
		t = read_ip_tree(input_dir+"%d.out.gz"%step)

		nodes[i%(max_cur+max_pre)] = set(t.keys())
		links[i%(max_cur+max_pre)] = set()
		directed_links[i%(max_cur+max_pre)] = set()
		for x in nodes[i%(max_cur+max_pre)]:
			if t[x] in nodes[i%(max_cur+max_pre)]:
				links[i%(max_cur+max_pre)].add((x,t[x]))
				links[i%(max_cur+max_pre)].add((t[x],x))
				directed_links[i%(max_cur+max_pre)].add((t[x],x))

		all_nodes.update(nodes[(i-max_cur+1)%(max_cur+max_pre)])
		all_links.update(links[(i-max_cur+1)%(max_cur+max_pre)])


		# compute the unions and intersections for each _current_ slice size k

		tmp_union_nodes = set()
		tmp_inter_nodes = nodes[(i-max_cur+1)%(max_cur+max_pre)].copy()
		tmp_union_links = set()
		tmp_inter_links = links[(i-max_cur+1)%(max_cur+max_pre)].copy()
		tmp_union_directed_links = set()
		tmp_inter_directed_links = directed_links[(i-max_cur+1)%(max_cur+max_pre)].copy()

		union_cur_nodes = {}
		inter_cur_nodes = {}
		union_cur_links = {}
		inter_cur_links = {}
		union_cur_directed_links = {}
		inter_cur_directed_links = {}

		for k in xrange(max_cur):
			tmp_union_nodes.update(nodes[(i-max_cur+k+1)%(max_cur+max_pre)])
			tmp_inter_nodes.intersection_update(nodes[(i-max_cur+k+1)%(max_cur+max_pre)])
			tmp_union_links.update(links[(i-max_cur+k+1)%(max_cur+max_pre)])
			tmp_inter_links.intersection_update(links[(i-max_cur+k+1)%(max_cur+max_pre)])
			tmp_union_directed_links.update(directed_links[(i-max_cur+k+1)%(max_cur+max_pre)])
			tmp_inter_directed_links.intersection_update(directed_links[(i-max_cur+k+1)%(max_cur+max_pre)])
			if k+1 in cur_set:
				union_cur_nodes[k+1] = tmp_union_nodes.copy()
				inter_cur_nodes[k+1] = tmp_inter_nodes.copy()
				union_cur_links[k+1] = tmp_union_links.copy()
				inter_cur_links[k+1] = tmp_inter_links.copy()
				union_cur_directed_links[k+1] = tmp_union_directed_links.copy()
				inter_cur_directed_links[k+1] = tmp_inter_directed_links.copy()


		# compute the unions and intersections for each _previous_ slice size k

		tmp_union_nodes = set()
		tmp_inter_nodes = nodes[(i-max_cur)%(max_cur+max_pre)].copy()
		tmp_union_links = set()
		tmp_inter_links = links[(i-max_cur)%(max_cur+max_pre)].copy()
		tmp_union_directed_links = set()
		tmp_inter_directed_links = directed_links[(i-max_cur)%(max_cur+max_pre)].copy()

		union_pre_nodes = {}
		inter_pre_nodes = {}
		union_pre_links = {}
		inter_pre_links = {}
		union_pre_directed_links = {}
		inter_pre_directed_links = {}

		for k in xrange(max_pre):
			tmp_union_nodes.update(nodes[(i-max_cur-k)%(max_cur+max_pre)])
			tmp_inter_nodes.intersection_update(nodes[(i-max_cur-k)%(max_cur+max_pre)])
			tmp_union_links.update(links[(i-max_cur-k)%(max_cur+max_pre)])
			tmp_inter_links.intersection_update(links[(i-max_cur-k)%(max_cur+max_pre)])
			tmp_union_directed_links.update(directed_links[(i-max_cur-k)%(max_cur+max_pre)])
			tmp_inter_directed_links.intersection_update(directed_links[(i-max_cur-k)%(max_cur+max_pre)])
			if k+1 in pre_set:
				union_pre_nodes[k+1] = tmp_union_nodes.copy()
				inter_pre_nodes[k+1] = tmp_inter_nodes.copy()
				union_pre_links[k+1] = tmp_union_links.copy()
				inter_pre_links[k+1] = tmp_inter_links.copy()
				union_pre_directed_links[k+1] = tmp_union_directed_links.copy()
				inter_pre_directed_links[k+1] = tmp_inter_directed_links.copy()


		# compute the statistics for each given interval

		for (pre,cur) in intervals:

			out = out_file[pre,cur]
			out_dist = out_dist_file[pre,cur]

			# basics
			out.write("%d %d %d %d %d"%(step-max_cur+1,len(all_nodes),len(all_links)/2,len(nodes[(i-max_cur+1)%(max_cur+max_pre)]),len(links[(i-max_cur+1)%(max_cur+max_pre)])/2))

			# node dynamics

			out.write(" %d %d %d %d"%(len(union_pre_nodes[pre]),len(union_cur_nodes[cur]),len(inter_pre_nodes[pre]),len(inter_cur_nodes[cur])))

			appearing_nodes = union_cur_nodes[cur].difference(union_pre_nodes[pre])
			disappearing_nodes = union_pre_nodes[pre].difference(union_cur_nodes[cur])
			appearing_stable_nodes = inter_cur_nodes[cur].difference(union_pre_nodes[pre])
			disappearing_stable_nodes = inter_pre_nodes[pre].difference(union_cur_nodes[cur])
			out.write(" %d %d %d %d"%(len(appearing_nodes),len(disappearing_nodes),len(appearing_stable_nodes),len(disappearing_stable_nodes)))

			# link dynamics

			out.write(" %d %d %d %d"%(len(union_pre_links[pre]),len(union_cur_links[cur]),len(inter_pre_links[pre]),len(inter_cur_links[cur])))

			appearing_links = union_cur_links[cur].difference(union_pre_links[pre])
			disappearing_links = union_pre_links[pre].difference(union_cur_links[cur])
			appearing_stable_links = inter_cur_links[cur].difference(union_pre_links[pre])
			disappearing_stable_links = inter_pre_links[pre].difference(union_cur_links[cur])
			out.write(" %d %d %d %d"%(len(appearing_links),len(disappearing_links),len(appearing_stable_links),len(disappearing_stable_links)))
			directed_appearing_links = union_cur_directed_links[cur].difference(union_pre_directed_links[pre])
			directed_disappearing_links = union_pre_directed_links[pre].difference(union_cur_directed_links[cur])

			# border nodes (/links ?)

			for (tmp_links,tmp_nodes) in [(appearing_links,appearing_nodes), (disappearing_links,disappearing_nodes), (directed_appearing_links,appearing_nodes), (directed_disappearing_links,disappearing_nodes)]:
				out_border_nodes = set()
				in_border_nodes = set()
				for (x,y) in tmp_links:
					if x not in tmp_nodes and y in tmp_nodes:
						out_border_nodes.add(x)
						in_border_nodes.add(y)
				out.write(" %d %d"%(len(out_border_nodes),len(in_border_nodes)))

			out.write("\n")
			out.flush()

			# distance between extremities of appearing links

			g = make_subgraph(union_pre_nodes[pre],union_pre_links[pre])
			for (u,v) in directed_appearing_links:
				out_dist.write("%d %d %d\n"%(step,len(directed_appearing_links),distance(g,u,v)))
			out_dist.flush()

			# connected components for Up_Uc

			for file,sub in [(out_cc_file_Up_Uc,make_subgraph), (out_cc_file_Up_Uc_d1,make_subgraph_d1)]:
				output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(disappearing_nodes,disappearing_links)))

			# connected components for Uc_Up

			for file,sub in [(out_cc_file_Uc_Up,make_subgraph), (out_cc_file_Uc_Up_d1,make_subgraph_d1)]:
				output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(appearing_nodes,appearing_links)))

			# connected components for Ic_Up

			for file,sub in [(out_cc_file_Ic_Up,make_subgraph), (out_cc_file_Ic_Up_d1,make_subgraph_d1)]:
				output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(appearing_stable_nodes,appearing_links)))

			# connected components for Ip_Uc

			for file,sub in [(out_cc_file_Ip_Uc,make_subgraph), (out_cc_file_Ip_Uc_d1,make_subgraph_d1)]:
				output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(disappearing_stable_nodes,disappearing_links)))

		i += 1
		sys.stdout.write("%d "%i)
		sys.stdout.flush()
		step += 1

	sys.stderr.write("\n")
	sys.stderr.flush()

