#!/usr/bin/python import os, csv, sys, time, re, math ''' Input: A bipartite graph. Output: A highly dense bipartite subgraph. (sort graph such that chose from high degree vertices first) ''' def sortListReverse(p_list): p_list.sort() L_rets = [] for i in range(len(p_list)): L_rets.append(p_list[len(p_list)-i-1]) return L_rets def Intersection(P_list1, P_list2): L_rets = [] L_dic = {} for item in P_list1: L_dic[item] = 1 for item in P_list2: if L_dic.has_key(item): L_rets.append(item) L_rets.sort() return L_rets class DenseSubGraph(): def __init__(self): ''' Initialize the class object geneList -- a dictionary that each key is an instance ID, such as a deleted gene or a patient's TCGA id, each eletmen is a list of co-expressed genes ''' self.__base = ['a','b','c','d','e'] def __k_subsets_i(self, n, k): ''' Yield each subset of size k from the set of intergers 0 .. n - 1 n -- an integer > 0 k -- an integer > 0 ''' # Validate args if n < 0: raise ValueError('n must be > 0, got n=%d' % n) if k < 0: raise ValueError('k must be > 0, got k=%d' % k) # check base cases if k == 0 or n < k: yield set() elif n == k: yield set(range(n)) else: # Use recursive formula based on binomial coeffecients: # choose(n, k) = choose(n - 1, k - 1) + choose(n - 1, k) for s in self.__k_subsets_i(n - 1, k - 1): s.add(n - 1) yield s for s in self.__k_subsets_i(n - 1, k): yield s def __k_subsets(self, s, k): ''' Yield all subsets of size k from set (or list) s s -- a set or list (any iterable will suffice) k -- an integer > 0 ''' s = list(s) n = len(s) for k_set in self.__k_subsets_i(n, k): yield set([s[i] for i in k_set]) def __makeMatrixSorted(self, P_mRow, P_topColNo): ''' Create a bipartite graph from self.__mInstances using a matrix P_mRow -- the minimum instance number that a gene is repeatedly used ''' L_matrix = [] L_allInstances = [] L_allGenes = [] geneTotal = {} for inst in self.__mInstances: for geneTp in self.__mInstances[inst]: if geneTotal.has_key(geneTp): geneTotal[geneTp] += 1 else: geneTotal[geneTp] = 1 useCt = {} for item in geneTotal: usTp = geneTotal[item] if usTp>=P_mRow: if not useCt.has_key(usTp): useCt[usTp] = [] useCt[usTp].append(item) useKey = [] for item in useCt: useKey.append(item) useKeyReverse = sortListReverse(useKey) for usCt in useKeyReverse: for item in useCt[usCt]: L_allGenes.append(item) ChoseCol = [] if P_topColNo= P_rNo*P_ratio: L_countCol += 1 #L_Score += (P_rNo - 1/(1-P_ratio)*(P_rNo-item) + 0.001) L_Score += (P_rNo - 1/(1-P_ratio)*(P_rNo-item) + 0.001)+math.log(P_rNo) if L_countCol<=p_minGene: L_Score = 0 return L_Score def Find_Subgraph_Sorted(self, geneList,p_miniGeneNo): L_TFAll = {} L_TFlist = [] L_comGenes = [] L_instances = [] L_Score = 0 p_topRowNo=20;p_topColNo=20 self.__mInstances = geneList P_mRow = 3 MatTp = self.__makeMatrixSorted(P_mRow,p_topColNo) instances = range(len(MatTp[0])) ''' print '------------------Matrix' for row in MatTp[2]: print row print instances ''' if len(instances)>=4: ChosenInstTp = [] if len(instances)>p_topRowNo: ChosenInstTp.extend(instances[0:p_topRowNo]) else: ChosenInstTp.extend(instances) subsets = self.__k_subsets(ChosenInstTp,4) for sset in subsets: commonGenesTp = [0]*len(MatTp[1]) subSolution = list(sset) selectPool = range(max(subSolution)+1,len(MatTp[0])) #print subSolution,'\n',selectPool for idx in subSolution: for idx2 in range(len(commonGenesTp)): commonGenesTp[idx2] += MatTp[2][idx][idx2] #print commonGenesTp, '\n-----------------------------' status = 1 ScoreTp = self.__getSubSolutionScore(len(subSolution),commonGenesTp,0.75,p_miniGeneNo) while status>0: status = 0 tempScore = -1 tempRow = -1 for tpRow in selectPool: tempCommonGene = [0]*len(MatTp[1]) for idx in range(len(commonGenesTp)): tempCommonGene[idx] = commonGenesTp[idx] + MatTp[2][tpRow][idx] tempScore2 = self.__getSubSolutionScore(len(subSolution)+1,tempCommonGene,0.75,p_miniGeneNo) if tempScore2 > tempScore: tempScore = tempScore2 tempRow = tpRow if tempScore>ScoreTp: status = 1 ScoreTp = tempScore subSolution.append(tempRow) selectPool.remove(tempRow) for idx in range(len(commonGenesTp)): commonGenesTp[idx] += MatTp[2][tempRow][idx] if ScoreTp > L_Score: L_Score = ScoreTp L_instances = [] for item in subSolution: L_instances.append(MatTp[0][item]) L_comGenes = [] for idx in range(len(commonGenesTp)): if commonGenesTp[idx]>=len(subSolution)*0.75: L_comGenes.append(MatTp[1][idx]) #print 'coexpressed=', L_comGenes #print 'instance=', L_instances #print 'score=',L_Score L_instancesUpdate = [] L_geneName2idx = {} for idx in range(len(MatTp[1])): L_geneName2idx[MatTp[1][idx]] = idx for idxInst in range(len(MatTp[0])): countDegree = 0 for gene in L_comGenes: if MatTp[2][idxInst][L_geneName2idx[gene]]==1: countDegree += 1 if countDegree>=len(L_comGenes)*0.75: L_instancesUpdate.append(MatTp[0][idxInst]) return L_instances,L_comGenes def test(self): subsets = self.__k_subsets(self.__base,2) for item in subsets: print item ''' #Example of using the class DenseSubGraph #Create an object of the class DenseSubGraph Object = DenseSubGraph() #Define a bipartite graph BiGraph = {'RPS24A(**9)': ['AIR1', 'CLN2', 'GLN1', 'IMP2', 'LSB1', 'PGI1', 'PRK1', 'PTK2', 'RGT1', 'RPB5', 'RVS167', 'SRV2', 'SSA1', 'UBX6'], 'FKS1(HAPLOID)': ['ABP1', 'ACT1', 'AKL1', 'ARG80', 'BUL1', 'CLN2', 'GYL1', 'HUA1', 'LAS17', 'LSB1', 'PAN1', 'PTK2', 'RHO1', 'RPC40', 'RVS167', 'SRV2', 'TOR2', 'UBP5', 'UTP9', 'VRP1', 'YPR078C'], 'RPS24A(HAPLOID)': ['AIP1', 'DED1', 'GCN4', 'GLK1', 'GYP5', 'MCM1', 'PTK2', 'SAC6', 'SLA2', 'SSA1', 'YSC84'], 'KAR2(TET PROMOTER)': ['ABP1', 'ACT1', 'AIR1', 'AKL1', 'CBK1', 'CLN2', 'CLN3', 'CRN1', 'GTS1', 'GYP5', 'HRD1', 'HUA1', 'IMP2', 'LSB1', 'MMS2', 'MSB1', 'MYO3', 'PAN1', 'PTK2', 'RGT1', 'RPB5', 'RPC40', 'RUP1', 'RVS167', 'SAC6', 'SRV2', 'TWF1', 'UBP5', 'UTP9', 'YSC84'], 'FKS1(TET PROMOTER)': ['ABP1', 'CLN2', 'CLN3', 'FKS1', 'LSB1', 'MYO3', 'PTK2', 'SCD5', 'SRV2', 'UBP5', 'YPR078C'], 'CDC42(TET PROMOTER)': ['ABP1', 'ACT1', 'AKL1', 'APC5', 'ARC15', 'BEM2', 'BUL1', 'CBK1', 'CDC3', 'CDC42', 'CLN2', 'CLN3', 'CRN1', 'FCY2', 'FKS1', 'HUA1', 'LSB1', 'MCM6', 'MMS2', 'MYO3', 'PRO3', 'PTK2', 'RPB3', 'RPB5', 'RPC40', 'SCP1', 'SLA2', 'SLG1', 'SRV2', 'STE12', 'SUC2', 'TWF1', 'UBP5', 'UTP9', 'YPR078C', 'YSC84'], 'SHE4': ['ACT1', 'AKL1', 'ARC15', 'ARC18', 'ARC35', 'ARC40', 'CAP1', 'CAP2', 'CLN2', 'DUN1', 'ENT1', 'ERB1', 'FCY2', 'FKS1', 'GYP5', 'HXT6', 'MCM6', 'MMS2', 'MYO3', 'PHO84', 'RHO1', 'RPA43', 'RPC40', 'RPO21', 'RVS161', 'SAC6', 'SCD5', 'SLA2', 'STT4', 'TWF1', 'UBP5', 'UBX6', 'UTP9', 'VRP1', 'YPR078C'], 'GLUCOSAMINE': ['ABP1', 'ARG80', 'CRN1', 'HXT6', 'PGI1', 'PRO3', 'PTK2', 'SRV2', 'UBP5', 'VRP1', 'YPR078C'], 'RPL12A': ['ABP1', 'CLN2', 'CLN3', 'CRN1', 'FCY2', 'GCN4', 'GTS1', 'HUA1', 'LOT6', 'PTK2', 'RVS167', 'SLA2', 'SRV2', 'UTP9']} #Set the minimum number of genes in the solution minGeneNo = 6 #Find the highly dense subgrpah Result = Object.Find_Subgraph_Sorted(BiGraph,minGeneNo) #print Result print 'Nodes in perturbation set:',Result[0] print 'Nodes in gene set:',Result[1] '''