#!/usr/bin/python

import os, csv, sys, time, re, math

'''
Input: A bipartite graph.
Output: A highly dense bipartite subgraph. (sort graph such that chose from high degree vertices first)
'''

def sortListReverse(p_list):
	p_list.sort()
	L_rets = []
	for i in range(len(p_list)):
		L_rets.append(p_list[len(p_list)-i-1])
	return L_rets

def Intersection(P_list1, P_list2):
	L_rets = []
	L_dic = {}
	for item in P_list1:
		L_dic[item] = 1
	for item in P_list2:
		if L_dic.has_key(item):
			L_rets.append(item)
	L_rets.sort()
	return L_rets

class DenseSubGraph():
	def __init__(self):
		'''
		Initialize the class object
		geneList -- a dictionary that each key is an instance ID, such as a deleted gene or a patient's TCGA id, each eletmen is a list of co-expressed genes
		'''

		self.__base = ['a','b','c','d','e']


	def __k_subsets_i(self, n, k):
		'''
		Yield each subset of size k from the set of intergers 0 .. n - 1
		n -- an integer > 0
		k -- an integer > 0
		'''
		# Validate args
		if n < 0:
			raise ValueError('n must be > 0, got n=%d' % n)
		if k < 0:
			raise ValueError('k must be > 0, got k=%d' % k)
		# check base cases
		if k == 0 or n < k:
			yield set()
		elif n == k:
			yield set(range(n))

		else:
			# Use recursive formula based on binomial coeffecients:
			# choose(n, k) = choose(n - 1, k - 1) + choose(n - 1, k)
			for s in self.__k_subsets_i(n - 1, k - 1):
				s.add(n - 1)
				yield s
			for s in self.__k_subsets_i(n - 1, k):
				yield s

	def __k_subsets(self, s, k):
		'''
		Yield all subsets of size k from set (or list) s
		s -- a set or list (any iterable will suffice)
		k -- an integer > 0
		'''
		s = list(s)
		n = len(s)
		for k_set in self.__k_subsets_i(n, k):
			yield set([s[i] for i in k_set])

	def __makeMatrixSorted(self, P_mRow, P_topColNo):
		'''
		Create a bipartite graph from self.__mInstances using a matrix
		P_mRow -- the minimum instance number that a gene is repeatedly used
		'''
		L_matrix = []
		L_allInstances = []
		L_allGenes = []
		geneTotal = {}
		for inst in self.__mInstances:
			for geneTp in self.__mInstances[inst]:
				if geneTotal.has_key(geneTp):
					geneTotal[geneTp] += 1
				else:
					geneTotal[geneTp] = 1
		useCt = {}
		for item in geneTotal:
			usTp = geneTotal[item]
			if usTp>=P_mRow:
				if not useCt.has_key(usTp):
					useCt[usTp] = []
				useCt[usTp].append(item)
		useKey = []
		for item in useCt:
			useKey.append(item)
		useKeyReverse = sortListReverse(useKey)
		for usCt in useKeyReverse:
			for item in useCt[usCt]:
				L_allGenes.append(item)
		ChoseCol = []
		if P_topColNo<len(L_allGenes):
			ChoseCol.extend(L_allGenes[0:P_topColNo])
		else:
			ChoseCol.extend(L_allGenes)
		
		UseRowCt = {}
		for inst in self.__mInstances:
			usTp = len(Intersection(self.__mInstances[inst],ChoseCol))
			if not UseRowCt.has_key(usTp):
				UseRowCt[usTp] = []
			UseRowCt[usTp].append(inst)
		useKey = []
		for item in UseRowCt:
			useKey.append(item)
		useKeyReverse = sortListReverse(useKey)
		for usCt in useKeyReverse:
			for item in UseRowCt[usCt]:
				L_allInstances.append(item)


		for inst in L_allInstances:
			geneSub = {}
			for geneTp in self.__mInstances[inst]:
				geneSub[geneTp] = 1
			matRow = []
			for geneTp in L_allGenes:
				if geneSub.has_key(geneTp):
					matRow.append(1)
				else:
					matRow.append(0)
			L_matrix.append(matRow)
		return L_allInstances,L_allGenes,L_matrix

	def __getSubSolutionScore(self, P_rNo, P_record, P_ratio,p_minGene):
		'''
		Calculate the score of a subsolution
		P_rNo -- the number of instances
		P_record -- a list to remember the number that a gene has been used
		'''
		L_Score = 0
		L_countCol = 0
		for item in P_record:
			if item >= P_rNo*P_ratio:
				L_countCol += 1
				#L_Score += (P_rNo - 1/(1-P_ratio)*(P_rNo-item) + 0.001)
				L_Score += (P_rNo - 1/(1-P_ratio)*(P_rNo-item) + 0.001)+math.log(P_rNo)
		if L_countCol<=p_minGene:
			L_Score = 0
		return L_Score

	def Find_Subgraph_Sorted(self, geneList,p_miniGeneNo):
		L_TFAll = {}
		L_TFlist = []
		L_comGenes = []
		L_instances = []
		L_Score = 0

                p_topRowNo=20;p_topColNo=20
		self.__mInstances = geneList
		P_mRow = 3
		MatTp = self.__makeMatrixSorted(P_mRow,p_topColNo)
		instances = range(len(MatTp[0]))

		'''
		print '------------------Matrix'
		for row in MatTp[2]:
			print row
		print instances
		'''
		
		if len(instances)>=4:
			ChosenInstTp = []
			if len(instances)>p_topRowNo:
				ChosenInstTp.extend(instances[0:p_topRowNo])
			else:
				ChosenInstTp.extend(instances)
			subsets = self.__k_subsets(ChosenInstTp,4)
			for sset in subsets:
				commonGenesTp = [0]*len(MatTp[1])
				subSolution = list(sset)
				selectPool = range(max(subSolution)+1,len(MatTp[0]))
				#print subSolution,'\n',selectPool
				for idx in subSolution:
					for idx2 in range(len(commonGenesTp)):
						commonGenesTp[idx2] += MatTp[2][idx][idx2]
				#print commonGenesTp, '\n-----------------------------'

				status = 1
				ScoreTp = self.__getSubSolutionScore(len(subSolution),commonGenesTp,0.75,p_miniGeneNo)
				while status>0:
					status = 0
					tempScore = -1
					tempRow = -1
					for tpRow in selectPool:
						tempCommonGene = [0]*len(MatTp[1])
						for idx in range(len(commonGenesTp)):
							tempCommonGene[idx] = commonGenesTp[idx] + MatTp[2][tpRow][idx]
						tempScore2 = self.__getSubSolutionScore(len(subSolution)+1,tempCommonGene,0.75,p_miniGeneNo)
						if tempScore2 > tempScore:
							tempScore = tempScore2
							tempRow = tpRow
					if tempScore>ScoreTp:
						status = 1
						ScoreTp = tempScore
						subSolution.append(tempRow)
						selectPool.remove(tempRow)
						for idx in range(len(commonGenesTp)):
							commonGenesTp[idx] += MatTp[2][tempRow][idx]
				if ScoreTp > L_Score:
					L_Score = ScoreTp
					L_instances = []
					for item in subSolution:
						L_instances.append(MatTp[0][item])
					L_comGenes = []
					for idx in range(len(commonGenesTp)):
						if commonGenesTp[idx]>=len(subSolution)*0.75:
							L_comGenes.append(MatTp[1][idx])
		#print 'coexpressed=', L_comGenes
		#print 'instance=', L_instances
		#print 'score=',L_Score

		L_instancesUpdate = []
		L_geneName2idx = {}
		for idx in range(len(MatTp[1])):
			L_geneName2idx[MatTp[1][idx]] = idx
		for idxInst in range(len(MatTp[0])):
			countDegree = 0
			for gene in L_comGenes:
				if MatTp[2][idxInst][L_geneName2idx[gene]]==1:
					countDegree += 1
			if countDegree>=len(L_comGenes)*0.75:
				L_instancesUpdate.append(MatTp[0][idxInst])
				

		return L_instances,L_comGenes


	def test(self):
		subsets = self.__k_subsets(self.__base,2)
		for item in subsets:
			print item


'''
#Example of using the class DenseSubGraph

#Create an object of the class DenseSubGraph
Object = DenseSubGraph()

#Define a bipartite graph
BiGraph = {'RPS24A(**9)': ['AIR1', 'CLN2', 'GLN1', 'IMP2', 'LSB1', 'PGI1', 'PRK1', 'PTK2', 'RGT1', 'RPB5', 'RVS167', 'SRV2', 'SSA1', 'UBX6'], 'FKS1(HAPLOID)': ['ABP1', 'ACT1', 'AKL1', 'ARG80', 'BUL1', 'CLN2', 'GYL1', 'HUA1', 'LAS17', 'LSB1', 'PAN1', 'PTK2', 'RHO1', 'RPC40', 'RVS167', 'SRV2', 'TOR2', 'UBP5', 'UTP9', 'VRP1', 'YPR078C'], 'RPS24A(HAPLOID)': ['AIP1', 'DED1', 'GCN4', 'GLK1', 'GYP5', 'MCM1', 'PTK2', 'SAC6', 'SLA2', 'SSA1', 'YSC84'], 'KAR2(TET PROMOTER)': ['ABP1', 'ACT1', 'AIR1', 'AKL1', 'CBK1', 'CLN2', 'CLN3', 'CRN1', 'GTS1', 'GYP5', 'HRD1', 'HUA1', 'IMP2', 'LSB1', 'MMS2', 'MSB1', 'MYO3', 'PAN1', 'PTK2', 'RGT1', 'RPB5', 'RPC40', 'RUP1', 'RVS167', 'SAC6', 'SRV2', 'TWF1', 'UBP5', 'UTP9', 'YSC84'], 'FKS1(TET PROMOTER)': ['ABP1', 'CLN2', 'CLN3', 'FKS1', 'LSB1', 'MYO3', 'PTK2', 'SCD5', 'SRV2', 'UBP5', 'YPR078C'], 'CDC42(TET PROMOTER)': ['ABP1', 'ACT1', 'AKL1', 'APC5', 'ARC15', 'BEM2', 'BUL1', 'CBK1', 'CDC3', 'CDC42', 'CLN2', 'CLN3', 'CRN1', 'FCY2', 'FKS1', 'HUA1', 'LSB1', 'MCM6', 'MMS2', 'MYO3', 'PRO3', 'PTK2', 'RPB3', 'RPB5', 'RPC40', 'SCP1', 'SLA2', 'SLG1', 'SRV2', 'STE12', 'SUC2', 'TWF1', 'UBP5', 'UTP9', 'YPR078C', 'YSC84'], 'SHE4': ['ACT1', 'AKL1', 'ARC15', 'ARC18', 'ARC35', 'ARC40', 'CAP1', 'CAP2', 'CLN2', 'DUN1', 'ENT1', 'ERB1', 'FCY2', 'FKS1', 'GYP5', 'HXT6', 'MCM6', 'MMS2', 'MYO3', 'PHO84', 'RHO1', 'RPA43', 'RPC40', 'RPO21', 'RVS161', 'SAC6', 'SCD5', 'SLA2', 'STT4', 'TWF1', 'UBP5', 'UBX6', 'UTP9', 'VRP1', 'YPR078C'], 'GLUCOSAMINE': ['ABP1', 'ARG80', 'CRN1', 'HXT6', 'PGI1', 'PRO3', 'PTK2', 'SRV2', 'UBP5', 'VRP1', 'YPR078C'], 'RPL12A': ['ABP1', 'CLN2', 'CLN3', 'CRN1', 'FCY2', 'GCN4', 'GTS1', 'HUA1', 'LOT6', 'PTK2', 'RVS167', 'SLA2', 'SRV2', 'UTP9']}

#Set the minimum number of genes in the solution
minGeneNo = 6

#Find the highly dense subgrpah
Result = Object.Find_Subgraph_Sorted(BiGraph,minGeneNo)

#print Result
print 'Nodes in perturbation set:',Result[0]
print 'Nodes in gene set:',Result[1]

'''