#!/usr/bin/python import os, csv, sys, math, time import xml.dom.minidom import networkx #In this program, associated proteins will include proteins that associated theit children #The program first make a topology sort of all nodes, then go reversedly merge up any node that #either pv>threshold_pv or size1: l_p = 1.0 return (1-l_p) def Union(P_list1, P_list2): L_rets = [] L_dic = {} for item in P_list1: L_rets.append(item) L_dic[item] = 1 for item in P_list2: if not(L_dic.has_key(item)): L_rets.append(item) L_rets.sort() return L_rets def Intersection(P_list1, P_list2): L_rets = [] L_dic = {} for item in P_list1: L_dic[item] = 1 for item in P_list2: if L_dic.has_key(item): L_rets.append(item) L_rets.sort() return L_rets def Difference(P_list1, P_list2): #return P_list2 - P_list1 L_rets = [] L_dic = {} for item in P_list1: L_dic[item] = 1 for item in P_list2: if not(L_dic.has_key(item)): L_rets.append(item) L_rets.sort() return L_rets class GoGraph(networkx.DiGraph): def __init__(self,P_weightGraph, P_Associate): networkx.DiGraph.__init__(self) self.association = 'Association_proteins' self.association_acc = 'Association_Accumulation' self.mapping = 'mapping_protiens' self.PV = 'P_value' self.TotalLost = 0.0 self.gene2Goterm = self.Gene2GoTermAssociation(P_Associate) self.Goterm2gene = self.GoTerm2GeneAssociation(P_Associate) self.NumberOfAllProtein = len(self.gene2Goterm) self.createDiGraph(P_weightGraph) def readWeights(self, P_filename = ''): #read edge-weighted go structure data xmldoc = xml.dom.minidom.parse(P_filename) xmlData = xmldoc.firstChild goTermList = xmlData.getElementsByTagName('term') goGraphRe = {} count =0 for term in goTermList: count += 0 if count<10: go = term.attributes['id'] goGraphRe[go.value] = {} parentList = term.getElementsByTagName('parent') for par in parentList: parGO = par.attributes['id'] goGraphRe[go.value][parGO.value] = {} edgeList = par.getElementsByTagName('edge') for edge in edgeList: edgeType = edge.attributes['type'] goGraphRe[go.value][parGO.value] = float(edge.firstChild.data) #print go.value, parGO.value, float(edge.firstChild.data) return goGraphRe def createDiGraph(self, P_filename): ''' create edge-weighted Goterm structure ''' GoStructure = self.readWeights(P_filename) for chtp in GoStructure: for pa in GoStructure[chtp]: self.add_edge(chtp,pa) self.edge[chtp][pa]['weight'] = GoStructure[chtp][pa] self.node_AssociationProtienData() self.node_Depth() def Gene2GoTermAssociation(self, P_fName): #create gene-goterm association relation L_rets = {} L_retNoDup = {} L_fileLipid = open(P_fName) for line in L_fileLipid: line2 = line.split('\t') if len(line2)>10: if L_rets.has_key(line2[2]): L_rets[line2[2]][line2[4]] = 1 else: L_rets[line2[2]] = {line2[4]:1} for gene in L_rets: GList = [] for Gtemp in L_rets[gene]: GList.append(Gtemp) L_retNoDup[gene] = GList return L_retNoDup def GoTerm2GeneAssociation(self, P_fName): #create goterm-gene association relation L_rets = {} L_retNoDup = {} L_fileLipid = open(P_fName) for line in L_fileLipid: line2 = line.split('\t') if len(line2)>10: if L_rets.has_key(line2[4]): L_rets[line2[4]][line2[2]] = 1 else: L_rets[line2[4]] = {line2[2]:1} for Gtp in L_rets: GeneList = [] for gene in L_rets[Gtp]: GeneList.append(gene) L_retNoDup[Gtp] = GeneList return L_retNoDup def propagate_AssociatedProteins(self): Top_sort_nodes = networkx.topological_sort(self) for nodeTp in Top_sort_nodes: ParentTp = self.successors(nodeTp) AssoSelf = self.node[nodeTp][self.association_acc] for nodePare in ParentTp: AssoPare = self.node[nodePare][self.association_acc] #print nodeTp,nodePare,AssoSelf,AssoPare UnionTwo = Union(AssoSelf,AssoPare) self.node[nodePare][self.association_acc] = UnionTwo def node_AssociationProtienData(self): for gotermTp in self.nodes(): self.node[gotermTp]['level'] = 1000000000 if self.Goterm2gene.has_key(gotermTp): self.node[gotermTp][self.association] = self.Goterm2gene[gotermTp] self.node[gotermTp][self.association_acc] = self.Goterm2gene[gotermTp] else: self.node[gotermTp][self.association] = [] self.node[gotermTp][self.association_acc] = [] self.node[gotermTp][self.mapping] = [] self.propagate_AssociatedProteins() def node_Depth(self): Top_sort_nodes = networkx.topological_sort(self) OrderReverse=[] for i in range(len(Top_sort_nodes)): OrderReverse.append(Top_sort_nodes[len(Top_sort_nodes)-i-1]) self.node[OrderReverse[0]]['level'] = 1 for nodeTp in OrderReverse: ChildrenTp = self.predecessors(nodeTp) for chd in ChildrenTp: if self.node[chd]['level']>self.node[nodeTp]['level']+1: self.node[chd]['level']=self.node[nodeTp]['level']+1 def symplify_Goterm_Structure(self): Top_Order = networkx.topological_sort(self) for nodeTp in Top_Order: if len(self.node[nodeTp][self.mapping])==0 and len(self.predecessors(nodeTp)) == 0: self.remove_node(nodeTp) def node_Mapping_ProteinList_PV1(self, P_protein_InList): #map gene list to Go Terms, use association to calculate PV Gene_List_Associated = [] for gene in P_protein_InList: if self.gene2Goterm.has_key(gene): Gene_List_Associated.append(gene) self.NumberOfProtein_InList = len(Gene_List_Associated) for nodeID in self.nodes(): P_Association=self.node[nodeID][self.association] P_protein = Intersection(P_protein_InList,P_Association) self.node[nodeID][self.mapping] = P_protein newPV = hyperGeo_Pvalue(len(P_protein),self.NumberOfProtein_InList, len(P_Association),self.NumberOfAllProtein) self.node[nodeID][self.PV] = newPV #--- if checkDup(P_Association)==1: print 'Asso dup' if checkDup(P_protein_InList)==1: print 'PList dup' if checkDup(P_protein)==1: print 'Intersection dup' def node_Mapping_ProteinList_PV2(self, P_protein_InList): #map gene list to Go Terms, use association_acc to calculate PV Gene_List_Associated = [] for gene in P_protein_InList: if self.gene2Goterm.has_key(gene): Gene_List_Associated.append(gene) self.NumberOfProtein_InList = len(Gene_List_Associated) for nodeID in self.nodes(): P_Association=self.node[nodeID][self.association] P_Association2=self.node[nodeID][self.association_acc] P_protein = Intersection(P_protein_InList,P_Association) self.node[nodeID][self.mapping] = P_protein newPV = hyperGeo_Pvalue(len(P_protein),self.NumberOfProtein_InList, len(P_Association2),self.NumberOfAllProtein) self.node[nodeID][self.PV] = newPV #--- if checkDup(P_Association)==1: print 'Asso dup' if checkDup(P_protein_InList)==1: print 'PList dup' if checkDup(P_protein)==1: print 'Intersection dup' def calculate_lost(self, P_nodeSource, P_nodeTarget): edgeWeight = self.edge[P_nodeSource][P_nodeTarget]['weight'] L_lost = edgeWeight*len(self.node[P_nodeSource][self.mapping]) #print 'edgeWeight=',edgeWeight, 'sizeOfSource=',len(self.node[P_nodeSource][self.mapping]),'lost=',L_lost return L_lost def merge_nodes(self, P_nodeSource, P_nodeTarget): #merge association protien in source node into target node, #update the P_value of the target node, and delete source node from the graph #use self.association to calculate PV L_lost = self. calculate_lost(P_nodeSource, P_nodeTarget) P_Source = self.node[P_nodeSource][self.association] P_Target = self.node[P_nodeTarget][self.association] self.node[P_nodeTarget][self.association] = Union(P_Source, P_Target) #--- if checkDup(self.node[P_nodeTarget][self.association]) == 1: print '---Asso dup' P_Source2 = self.node[P_nodeSource][self.mapping] P_Target2 = self.node[P_nodeTarget][self.mapping] self.node[P_nodeTarget][self.mapping] = Union(P_Source2, P_Target2) #--- if len(self.node[P_nodeTarget][self.mapping])>self.NumberOfProtein_InList: print len(P_Source2), len(P_Target2) if checkDup(self.node[P_nodeTarget][self.mapping]) ==1: print '!!! mapping Dup' #print 'All protiens=',self.NumberOfAllProtein, 'All white protien#=',len(self.node[P_nodeTarget][self.association]) #print 'whiteDraw=',len(self.node[P_nodeTarget][self.mapping]), 'Alldraw = ',self.NumberOfProtein_InList newPV = hyperGeo_Pvalue(len(self.node[P_nodeTarget][self.mapping]),self.NumberOfProtein_InList, len(self.node[P_nodeTarget][self.association]),self.NumberOfAllProtein) self.node[P_nodeTarget][self.PV] = newPV self.TotalLost += L_lost Children_S = self.predecessors(P_nodeSource) Children_T = self.predecessors(P_nodeTarget) C_S_sub_T = Difference(Children_T,Children_S) #print 'Children of S not of T=',C_S_sub_T for proteinTp in C_S_sub_T: self.add_edge(proteinTp,P_nodeTarget) self.edge[proteinTp][P_nodeTarget]['weight'] = self.edge[proteinTp][P_nodeSource]['weight'] + self.edge[P_nodeSource][P_nodeTarget]['weight'] self.remove_node(P_nodeSource) def merge_nodes2(self, P_nodeSource, P_nodeTarget): #merge association protien in source node into target node, #update the P_value of the target node, and delete source node from the graph #use self.association_acc to calculate PV L_lost = self. calculate_lost(P_nodeSource, P_nodeTarget) P_Source = self.node[P_nodeSource][self.association] P_Target = self.node[P_nodeTarget][self.association] self.node[P_nodeTarget][self.association] = Union(P_Source, P_Target) #--- if checkDup(self.node[P_nodeTarget][self.association]) == 1: print '---Asso dup' P_Source2 = self.node[P_nodeSource][self.mapping] P_Target2 = self.node[P_nodeTarget][self.mapping] self.node[P_nodeTarget][self.mapping] = Union(P_Source2, P_Target2) #--- if len(self.node[P_nodeTarget][self.mapping])>self.NumberOfProtein_InList: print len(P_Source2), len(P_Target2) if checkDup(self.node[P_nodeTarget][self.mapping]) ==1: print '!!! mapping Dup' #print 'All protiens=',self.NumberOfAllProtein, 'All white protien#=',len(self.node[P_nodeTarget][self.association]) #print 'whiteDraw=',len(self.node[P_nodeTarget][self.mapping]), 'Alldraw = ',self.NumberOfProtein_InList newPV = hyperGeo_Pvalue(len(self.node[P_nodeTarget][self.mapping]),self.NumberOfProtein_InList, len(self.node[P_nodeTarget][self.association_acc]),self.NumberOfAllProtein) self.node[P_nodeTarget][self.PV] = newPV self.TotalLost += L_lost Children_S = self.predecessors(P_nodeSource) Children_T = self.predecessors(P_nodeTarget) C_S_sub_T = Difference(Children_T,Children_S) #print 'Children of S not of T=',C_S_sub_T for proteinTp in C_S_sub_T: self.add_edge(proteinTp,P_nodeTarget) self.edge[proteinTp][P_nodeTarget]['weight'] = self.edge[proteinTp][P_nodeSource]['weight'] + self.edge[P_nodeSource][P_nodeTarget]['weight'] self.remove_node(P_nodeSource) def printResult(self): ''' print 'The total information lost is:', self.TotalLost print 'The final goterm and associated protein list:' for gotp in self.nodes(): if len(self.node[gotp][self.mapping])>0: print gotp,':',self.node[gotp][self.mapping] ''' L_Goterms = {} for gotp in self.nodes(): if len(self.node[gotp][self.mapping])>0: L_Goterms[gotp] = [self.node[gotp][self.mapping],self.node[gotp]['level']] return [self.TotalLost,L_Goterms] def printResult_Leave(self): ''' print 'The total information lost is:', self.TotalLost print 'The final goterm and associated protein list:' for gotp in self.nodes(): if len(self.node[gotp][self.mapping])>0: print gotp,':',self.node[gotp][self.mapping] ''' L_Goterms = {} for gotp in self.nodes(): if len(self.node[gotp][self.mapping])>0: if len(self.predecessors(gotp)) == 0: L_Goterms[gotp] = [self.node[gotp][self.mapping],self.node[gotp]['level']] return [self.TotalLost,L_Goterms] def gotermSummarization(self, P_geneList, P_value, P_minProteinNo): #use P_GotermNumber to summarizate a given list of proteins #Topologically sort at first self.node_Mapping_ProteinList_PV2(P_geneList) self.symplify_Goterm_Structure() #Topologically sort all node, the first element is a leaf and the last element is the root New_Top_sort_nodes = networkx.topological_sort(self) #go through all nodes in topological order from leaves to the root. for nodeTp_child in New_Top_sort_nodes: nodePV = self.node[nodeTp_child][self.PV] nodeSize = len(self.node[nodeTp_child][self.mapping]) #if the current node's pv or size is not constrained by the setting conditions, merge this node to the most close parent node. if nodePV>P_value or nodeSize'+str(P_result[1][item][1])+'>' for gene in P_result[1][item][0]: output += gene+':' output += '\n' output += 'End\n\n' return output ''' stTime = time.mktime(time.gmtime()) #------------------------------------------------------------------------------------------------------ #Example of using the class GoGraph: #Data of Go Ontology structure and gene_Goterm association weightGographData = 'newWeightedPubMedGO.xml' geneGotermAssociationData = 'gene_association.goa_human_2012' #Create a GoGraph object (Node: every time you use the gotermSummarization(), you need to create a new object) G = GoGraph(weightGographData,geneGotermAssociationData) #A list of genes need to be summarized GeneList = ['CLEC5A','COL11A1','CRABP2','CXCL10','DCC1','DDAH2','DEFB1','DEPDC1','DKFZp762E1312','DLG7','DNA2L','DSC2','DSC3','ECT2','ENC1','EXOSC5','EYA4','EZH2','FABP5','FAM64A','FGF9','FLJ21963','GFOD1','GGH','GINS1','GLDC','GOLGA8A','GOLT1B','HMMR','HRASLS','HS3ST3A1','IGFBP2','ING4','INHBB','ISG15','ITGB8','KIAA1199','KIF11','KIF14','KIF20A','KIF23','KIF2C','KIFC1','KLK7','KPNA2','KRT6A','LMNB1','MAL','MELK','MFAP2','MGAT4A','MKI67','MRS2L','MUC16','MYO10','NCAPD2','NDC80','NEK2','NETO2','NID2','NLRP2','NMU','NRAS','NRCAM','NUSAP1','OGDHL'] #Using Go term to summarize the list of gene. Result = G.gotermSummarization(GeneList,0.05,3) #0.05 is the threshold of P_Value of the Go term node in final result. #5 is the minimum number of genes in the Go term node in final result. #The result has the format: [value_0,{Goterm_1:[a list of genes_1,value_1],Goterm_2:[a list of genes_2,value_2],....}] # value_0: the total information lost for the summarization. # Goterm_1: Goterm ID that is used to summarize the given list of gene. # a list of genes_1: a subset of genes (in the given list of gene) that are annotated by Goterm_1. # value_1: the level of Goterm_1 on the Go Ontology. The root note is in level 1. #print the result print 'Total information lost is', Result[0] for goterm in Result[1]: print 'GO term ID:', goterm, '--------------' print 'GO term level:',Result[1][goterm][1] print 'Gene list:',Result[1][goterm][0],'\n' #------------------------------------------------------------------------------ print '------------------------------------------------------------' endTime = time.mktime(time.gmtime()) print 'Total time =', ((endTime-stTime)/60), 'minutes' '''