#!/usr/bin/python
import os, csv, sys, math, time
import xml.dom.minidom
import networkx

#In this program, associated proteins will include proteins that associated theit children
#The program first make a topology sort of all nodes, then go reversedly merge up any node that
#either pv>threshold_pv or size<threshold_size.

def checkDup(P_list):
	PTp = P_list
	PTp.sort()
	for i in (range(len(PTp)-1)):
		if PTp[i]==PTp[i+1]:
			return 1
	return 0
def hyperGeo_pr(p_x,p_k,p_m,p_n):
#Function to calculate the probability
#input:
#	p_x: number of white balls drawn.
#	p_k: number of balls drawn (note p_x is not larger than p_k).
#	p_m: number of white balls in the urn.
#	p_n: number of black balls in the urn.
#output:
#	the probability that has p_x white balls in a drawing of p_k balls.

	l_nue = []
	l_deno = []
	for j in range(p_x):
		l_nue.append(p_m-j)
		l_deno.append(j+1)
	for j in range(p_k-p_x):
		l_nue.append(p_n-j)
		l_deno.append(j+1)
	for j in range(p_k):
		l_deno.append(p_m+p_n-j)
		l_nue.append(j+1)
	l_deno.sort()
	l_nue.sort()
	#l_p = 1.0
	l_p2 = 0.0

	try:
		for j in range(len(l_deno)):
			#l_p = l_p*l_nue[j]/l_deno[j]
			l_p2 += math.log(l_nue[j]) - math.log(l_deno[j])
	
	except:
		return 0
	
	#return math.exp(l_p)
	return math.exp(l_p2)

def hyperGeo_Pvalue(white_ball_drawn,total_ball_drawn,white_ball_in_urn,total_ball_in_urn):
#Function to calculate the p_value
#input:
#	white_ball_drawn: number of white balls drawn.
#	white_ball_in_urn: number of white balls in the urn.
#	total_ball_in_urn: number of black balls in the urn.
#	total_ball_drawn: number of balls drawn (note white_ball_drawn is not larger than total_ball_drawn).
#output:
#	1-the probability that has at least white_ball_drawn white balls in a drawing of total_ball_drawn balls.

	l_p = 0.0
	for j in range(white_ball_drawn):
		hptp = hyperGeo_pr(j,total_ball_drawn,white_ball_in_urn,total_ball_in_urn-white_ball_in_urn)
		l_p += hptp
	if l_p>1:
		l_p = 1.0
	return (1-l_p)

def Union(P_list1, P_list2):
	L_rets = []
	L_dic = {}
	for item in P_list1:
		L_rets.append(item)
		L_dic[item] = 1
	for item in P_list2:
		if not(L_dic.has_key(item)):
			L_rets.append(item)
	L_rets.sort()
	return L_rets

def Intersection(P_list1, P_list2):
	L_rets = []
	L_dic = {}
	for item in P_list1:
		L_dic[item] = 1
	for item in P_list2:
		if L_dic.has_key(item):
			L_rets.append(item)
	L_rets.sort()
	return L_rets

def Difference(P_list1, P_list2):
	#return P_list2 - P_list1
	L_rets = []
	L_dic = {}
	for item in P_list1:
		L_dic[item] = 1
	for item in P_list2:
		if not(L_dic.has_key(item)):
			L_rets.append(item)
	L_rets.sort()
	return L_rets

class GoGraph(networkx.DiGraph):
	def __init__(self,P_weightGraph, P_Associate):
		networkx.DiGraph.__init__(self)
		self.association = 'Association_proteins'
		self.association_acc = 'Association_Accumulation'
		self.mapping = 'mapping_protiens'
		self.PV = 'P_value'
		self.TotalLost = 0.0
		self.gene2Goterm = self.Gene2GoTermAssociation(P_Associate)
		self.Goterm2gene = self.GoTerm2GeneAssociation(P_Associate)
		self.NumberOfAllProtein = len(self.gene2Goterm)
		self.createDiGraph(P_weightGraph)

	def readWeights(self, P_filename = ''):
	#read edge-weighted go structure data

		xmldoc = xml.dom.minidom.parse(P_filename)
		xmlData = xmldoc.firstChild
		goTermList = xmlData.getElementsByTagName('term')

		goGraphRe = {}
		count =0
		for term in goTermList:
			count += 0
			if count<10:
				go = term.attributes['id']
				goGraphRe[go.value] = {}
				parentList = term.getElementsByTagName('parent')
				for par in parentList:
					parGO = par.attributes['id']
					goGraphRe[go.value][parGO.value] = {}
					edgeList = par.getElementsByTagName('edge')
					for edge in edgeList:
						edgeType = edge.attributes['type']
						goGraphRe[go.value][parGO.value] = float(edge.firstChild.data)
						#print go.value, parGO.value, float(edge.firstChild.data)
		return goGraphRe

	def createDiGraph(self, P_filename):
		'''
		create edge-weighted Goterm structure
		'''
		GoStructure = self.readWeights(P_filename)
		for chtp in GoStructure:
			for pa in GoStructure[chtp]:
				self.add_edge(chtp,pa)
				self.edge[chtp][pa]['weight'] = GoStructure[chtp][pa]
		self.node_AssociationProtienData()
		self.node_Depth()

	def Gene2GoTermAssociation(self, P_fName):
	#create gene-goterm association relation

		L_rets = {}
		L_retNoDup = {}
		L_fileLipid = open(P_fName)
		for line in L_fileLipid:
			line2 = line.split('\t')
			if len(line2)>10:
				if L_rets.has_key(line2[2]):
					L_rets[line2[2]][line2[4]] = 1
				else:
					L_rets[line2[2]] = {line2[4]:1}
		for gene in L_rets:
			GList = []
			for Gtemp in L_rets[gene]:
				GList.append(Gtemp)
			L_retNoDup[gene] = GList
		return L_retNoDup

	def GoTerm2GeneAssociation(self, P_fName):
	#create goterm-gene association relation

		L_rets = {}
		L_retNoDup = {}
		L_fileLipid = open(P_fName)
		for line in L_fileLipid:
			line2 = line.split('\t')
			if len(line2)>10:
				if L_rets.has_key(line2[4]):
					L_rets[line2[4]][line2[2]] = 1
				else:
					L_rets[line2[4]] = {line2[2]:1}
		for Gtp in L_rets:
			GeneList = []
			for gene in L_rets[Gtp]:
				GeneList.append(gene)
			L_retNoDup[Gtp] = GeneList

		return L_retNoDup

	def propagate_AssociatedProteins(self):
		Top_sort_nodes = networkx.topological_sort(self)
		for nodeTp in Top_sort_nodes:
			ParentTp = self.successors(nodeTp)
			AssoSelf = self.node[nodeTp][self.association_acc]
			for nodePare in ParentTp:
				AssoPare = self.node[nodePare][self.association_acc]
				#print nodeTp,nodePare,AssoSelf,AssoPare
				UnionTwo = Union(AssoSelf,AssoPare)
				self.node[nodePare][self.association_acc] = UnionTwo

	def node_AssociationProtienData(self):
		for gotermTp in self.nodes():
			self.node[gotermTp]['level'] = 1000000000
			if self.Goterm2gene.has_key(gotermTp):
				self.node[gotermTp][self.association] = self.Goterm2gene[gotermTp]
				self.node[gotermTp][self.association_acc] = self.Goterm2gene[gotermTp]
			else:
				self.node[gotermTp][self.association] = []
				self.node[gotermTp][self.association_acc] = []
			self.node[gotermTp][self.mapping] = []

		self.propagate_AssociatedProteins()

	def node_Depth(self):
		Top_sort_nodes = networkx.topological_sort(self)
		OrderReverse=[]
		for i in range(len(Top_sort_nodes)):
			OrderReverse.append(Top_sort_nodes[len(Top_sort_nodes)-i-1])
		
		self.node[OrderReverse[0]]['level'] = 1
		for nodeTp in OrderReverse:
			ChildrenTp = self.predecessors(nodeTp)
			for chd in ChildrenTp:
				if self.node[chd]['level']>self.node[nodeTp]['level']+1:
					self.node[chd]['level']=self.node[nodeTp]['level']+1
		
	def symplify_Goterm_Structure(self):
		Top_Order = networkx.topological_sort(self)
		for nodeTp in Top_Order:
			if len(self.node[nodeTp][self.mapping])==0 and len(self.predecessors(nodeTp)) == 0:
				self.remove_node(nodeTp)

	def node_Mapping_ProteinList_PV1(self, P_protein_InList):
	#map gene list to Go Terms, use association to calculate PV

		Gene_List_Associated = []
		for gene in P_protein_InList:
			if self.gene2Goterm.has_key(gene):
				Gene_List_Associated.append(gene)
		self.NumberOfProtein_InList = len(Gene_List_Associated)
		
		for nodeID in self.nodes():
			P_Association=self.node[nodeID][self.association]
			P_protein = Intersection(P_protein_InList,P_Association)
			self.node[nodeID][self.mapping] = P_protein
			newPV = hyperGeo_Pvalue(len(P_protein),self.NumberOfProtein_InList, len(P_Association),self.NumberOfAllProtein)
			self.node[nodeID][self.PV] = newPV
			
			#---
			if checkDup(P_Association)==1:
				print 'Asso dup'
			if checkDup(P_protein_InList)==1:
				print 'PList dup'
			if checkDup(P_protein)==1:
				print 'Intersection dup'

	def node_Mapping_ProteinList_PV2(self, P_protein_InList):
	#map gene list to Go Terms, use association_acc to calculate PV

		Gene_List_Associated = []
		for gene in P_protein_InList:
			if self.gene2Goterm.has_key(gene):
				Gene_List_Associated.append(gene)
		self.NumberOfProtein_InList = len(Gene_List_Associated)

		for nodeID in self.nodes():
			P_Association=self.node[nodeID][self.association]
			P_Association2=self.node[nodeID][self.association_acc]
			P_protein = Intersection(P_protein_InList,P_Association)
			self.node[nodeID][self.mapping] = P_protein
				
			newPV = hyperGeo_Pvalue(len(P_protein),self.NumberOfProtein_InList, len(P_Association2),self.NumberOfAllProtein)
			self.node[nodeID][self.PV] = newPV
			
			#---
			if checkDup(P_Association)==1:
				print 'Asso dup'
			if checkDup(P_protein_InList)==1:
				print 'PList dup'
			if checkDup(P_protein)==1:
				print 'Intersection dup'


	def calculate_lost(self, P_nodeSource, P_nodeTarget):
		edgeWeight = self.edge[P_nodeSource][P_nodeTarget]['weight']
		L_lost = edgeWeight*len(self.node[P_nodeSource][self.mapping])
		#print 'edgeWeight=',edgeWeight, 'sizeOfSource=',len(self.node[P_nodeSource][self.mapping]),'lost=',L_lost
		return L_lost

	def merge_nodes(self, P_nodeSource, P_nodeTarget):
	#merge association protien in source node into target node,
	#update the P_value of the target node, and delete source node from the graph
	#use self.association to calculate PV

		L_lost = self. calculate_lost(P_nodeSource, P_nodeTarget)
		P_Source = self.node[P_nodeSource][self.association]
		P_Target = self.node[P_nodeTarget][self.association]
		self.node[P_nodeTarget][self.association] = Union(P_Source, P_Target)

		#---
		if checkDup(self.node[P_nodeTarget][self.association]) == 1:
			print '---Asso dup'

		P_Source2 = self.node[P_nodeSource][self.mapping]
		P_Target2 = self.node[P_nodeTarget][self.mapping]
		self.node[P_nodeTarget][self.mapping] = Union(P_Source2, P_Target2)

		#---
		if len(self.node[P_nodeTarget][self.mapping])>self.NumberOfProtein_InList:
			print len(P_Source2), len(P_Target2)
		if checkDup(self.node[P_nodeTarget][self.mapping]) ==1:
			print '!!! mapping Dup'

		#print 'All protiens=',self.NumberOfAllProtein, 'All white protien#=',len(self.node[P_nodeTarget][self.association])
		#print 'whiteDraw=',len(self.node[P_nodeTarget][self.mapping]), 'Alldraw = ',self.NumberOfProtein_InList
		newPV = hyperGeo_Pvalue(len(self.node[P_nodeTarget][self.mapping]),self.NumberOfProtein_InList, len(self.node[P_nodeTarget][self.association]),self.NumberOfAllProtein)
		self.node[P_nodeTarget][self.PV] = newPV
		self.TotalLost += L_lost
		Children_S = self.predecessors(P_nodeSource)
		Children_T = self.predecessors(P_nodeTarget)
		C_S_sub_T = Difference(Children_T,Children_S)
		#print 'Children of S not of T=',C_S_sub_T
		for proteinTp in C_S_sub_T:
			self.add_edge(proteinTp,P_nodeTarget)
			self.edge[proteinTp][P_nodeTarget]['weight'] = self.edge[proteinTp][P_nodeSource]['weight'] + self.edge[P_nodeSource][P_nodeTarget]['weight']
		self.remove_node(P_nodeSource)


	def merge_nodes2(self, P_nodeSource, P_nodeTarget):
	#merge association protien in source node into target node,
	#update the P_value of the target node, and delete source node from the graph
	#use self.association_acc to calculate PV

		L_lost = self. calculate_lost(P_nodeSource, P_nodeTarget)
		P_Source = self.node[P_nodeSource][self.association]
		P_Target = self.node[P_nodeTarget][self.association]
		self.node[P_nodeTarget][self.association] = Union(P_Source, P_Target)

		#---
		if checkDup(self.node[P_nodeTarget][self.association]) == 1:
			print '---Asso dup'

		P_Source2 = self.node[P_nodeSource][self.mapping]
		P_Target2 = self.node[P_nodeTarget][self.mapping]
		self.node[P_nodeTarget][self.mapping] = Union(P_Source2, P_Target2)

		#---
		if len(self.node[P_nodeTarget][self.mapping])>self.NumberOfProtein_InList:
			print len(P_Source2), len(P_Target2)
		if checkDup(self.node[P_nodeTarget][self.mapping]) ==1:
			print '!!! mapping Dup'

		#print 'All protiens=',self.NumberOfAllProtein, 'All white protien#=',len(self.node[P_nodeTarget][self.association])
		#print 'whiteDraw=',len(self.node[P_nodeTarget][self.mapping]), 'Alldraw = ',self.NumberOfProtein_InList
		newPV = hyperGeo_Pvalue(len(self.node[P_nodeTarget][self.mapping]),self.NumberOfProtein_InList, len(self.node[P_nodeTarget][self.association_acc]),self.NumberOfAllProtein)
		self.node[P_nodeTarget][self.PV] = newPV
		self.TotalLost += L_lost
		Children_S = self.predecessors(P_nodeSource)
		Children_T = self.predecessors(P_nodeTarget)
		C_S_sub_T = Difference(Children_T,Children_S)
		#print 'Children of S not of T=',C_S_sub_T
		for proteinTp in C_S_sub_T:
			self.add_edge(proteinTp,P_nodeTarget)
			self.edge[proteinTp][P_nodeTarget]['weight'] = self.edge[proteinTp][P_nodeSource]['weight'] + self.edge[P_nodeSource][P_nodeTarget]['weight']
		self.remove_node(P_nodeSource)


	def printResult(self):
		'''
		print 'The total information lost is:', self.TotalLost
		print 'The final goterm and associated protein list:'
		for gotp in self.nodes():
			if len(self.node[gotp][self.mapping])>0:
				print gotp,':',self.node[gotp][self.mapping]
		'''
		L_Goterms = {}
		for gotp in self.nodes():
			if len(self.node[gotp][self.mapping])>0:
				L_Goterms[gotp] = [self.node[gotp][self.mapping],self.node[gotp]['level']]
		return [self.TotalLost,L_Goterms]

	def printResult_Leave(self):
		'''
		print 'The total information lost is:', self.TotalLost
		print 'The final goterm and associated protein list:'
		for gotp in self.nodes():
			if len(self.node[gotp][self.mapping])>0:
				print gotp,':',self.node[gotp][self.mapping]
		'''
		L_Goterms = {}
		for gotp in self.nodes():
			if len(self.node[gotp][self.mapping])>0:
				if len(self.predecessors(gotp)) == 0:
					L_Goterms[gotp] = [self.node[gotp][self.mapping],self.node[gotp]['level']]
		return [self.TotalLost,L_Goterms]


	def gotermSummarization(self, P_geneList, P_value, P_minProteinNo):
	#use P_GotermNumber to summarizate a given list of proteins
	#Topologically sort at first

		self.node_Mapping_ProteinList_PV2(P_geneList)
		self.symplify_Goterm_Structure()
		
		#Topologically sort all node, the first element is a leaf and the last element is the root
		New_Top_sort_nodes = networkx.topological_sort(self)

		#go through all nodes in topological order from leaves to the root.
		for nodeTp_child in New_Top_sort_nodes:
			nodePV = self.node[nodeTp_child][self.PV]
			nodeSize = len(self.node[nodeTp_child][self.mapping])

			#if the current node's pv or size is not constrained by the setting conditions, merge this node to the most close parent node.
			if nodePV>P_value or nodeSize<P_minProteinNo:
				L_parrents = self.successors(nodeTp_child)
				L_MinWeight = 100000;L_MinPa = 'non'
				for nodetp in L_parrents:
					if self.edge[nodeTp_child][nodetp]['weight']<L_MinWeight:
						L_MinWeight = self.edge[nodeTp_child][nodetp]['weight']
						L_MinPa = nodetp
				if L_MinPa == 'non':
					pass
					#print '+++1',nodeTp_child,L_MinPa,L_parrents
				else:
					self.merge_nodes2(nodeTp_child,L_MinPa)


		return self.printResult()


'''
def readExpressionData(p_fName):

	fileN = open(p_fName,'r')
	count = 0
	knock = ''
	ret = {}
	allGene = {}
	L_rets = {}
	L_allGene = []
	for line in fileN:
		lineTp = line.split('\t')
		knock = lineTp[1]; gene = lineTp[2]
		allGene[gene] = 1
		if ret.has_key(knock):
			ret[knock][gene]=1
		else:
			ret[knock] = {gene:1}

	for inst in ret:
		coGenes = []
		for gene in ret[inst]:
			coGenes.append(gene)
		L_rets[inst] = coGenes
	for gene in allGene:
		L_allGene.append(gene)
	return [L_rets,L_allGene]
'''

def printResult(P_goTerm,P_result):
	output = 'Instance>'+P_goTerm
	output += '>'+str(P_result[0])+'\nBegin\n'
	for item in P_result[1]:
		output += item+'>'+str(P_result[1][item][1])+'>'
		for gene in P_result[1][item][0]:
			output += gene+':'
		output += '\n'
	output += 'End\n\n'
	return output


'''
stTime = time.mktime(time.gmtime())

#------------------------------------------------------------------------------------------------------
#Example of using the class GoGraph:

#Data of Go Ontology structure and gene_Goterm association
weightGographData = 'newWeightedPubMedGO.xml'
geneGotermAssociationData = 'gene_association.goa_human_2012'


#Create a GoGraph object (Node: every time you use the gotermSummarization(), you need to create a new object)
G = GoGraph(weightGographData,geneGotermAssociationData)

#A list of genes need to be summarized
GeneList = ['CLEC5A','COL11A1','CRABP2','CXCL10','DCC1','DDAH2','DEFB1','DEPDC1','DKFZp762E1312','DLG7','DNA2L','DSC2','DSC3','ECT2','ENC1','EXOSC5','EYA4','EZH2','FABP5','FAM64A','FGF9','FLJ21963','GFOD1','GGH','GINS1','GLDC','GOLGA8A','GOLT1B','HMMR','HRASLS','HS3ST3A1','IGFBP2','ING4','INHBB','ISG15','ITGB8','KIAA1199','KIF11','KIF14','KIF20A','KIF23','KIF2C','KIFC1','KLK7','KPNA2','KRT6A','LMNB1','MAL','MELK','MFAP2','MGAT4A','MKI67','MRS2L','MUC16','MYO10','NCAPD2','NDC80','NEK2','NETO2','NID2','NLRP2','NMU','NRAS','NRCAM','NUSAP1','OGDHL']


#Using Go term to summarize the list of gene.
Result =  G.gotermSummarization(GeneList,0.05,3)

#0.05 is the threshold of P_Value of the Go term node in final result.
#5 is the minimum number of genes in the Go term node in final result.
#The result has the format: [value_0,{Goterm_1:[a list of genes_1,value_1],Goterm_2:[a list of genes_2,value_2],....}]
# value_0: the total information lost for the summarization.
# Goterm_1: Goterm ID that is used to summarize the given list of gene.
# a list of genes_1: a subset of genes (in the given list of gene) that are annotated by Goterm_1.
# value_1: the level of Goterm_1 on the Go Ontology. The root note is in level 1.



#print the result
print 'Total information lost is', Result[0]
for goterm in Result[1]:
	print 'GO term ID:', goterm, '--------------'
	print 'GO term level:',Result[1][goterm][1]
	print 'Gene list:',Result[1][goterm][0],'\n'

#------------------------------------------------------------------------------

print '------------------------------------------------------------'
endTime = time.mktime(time.gmtime())
print 'Total time =', ((endTime-stTime)/60), 'minutes'
'''