import json
import csv
import sys
import os
import rgi
import re

import argparse
import filepaths

# Locations resolved once, when this module is imported.
# script_path is whatever the project-local filepaths helper reports
# (presumably the directory containing this script -- TODO confirm).
script_path = filepaths.determine_path()
# Current working directory at import time; printCSV writes its output here.
working_directory = os.getcwd()
# script_path with a trailing slash appended.
path = script_path+"/"

def check_delimiter(fastaHeader):
	"""Return the first known delimiter that occurs in a FASTA header.

	Candidates are tried in priority order: colon, pipe, dash, underscore.

	Args:
		fastaHeader: header text to inspect.

	Returns:
		One of ":", "|", "-", "_", or "" when none of them is present.
	"""
	# Plain substring tests are used on purpose: as a regular expression '|'
	# is an alternation of two empty patterns, so a regex-based split never
	# recognises a literal pipe.
	for delimiter in (":", "|", "-", "_"):
		if delimiter in fastaHeader:
			return delimiter
	return ""

def findnthbar(bunchstr, start):
	"""Return one field of a '|'-separated string (e.g. an alignment title).

	Args:
		bunchstr: the '|'-delimited text.
		start: zero-based index of the wanted field.

	Returns:
		The text of field number ``start``, or "" when that field does not
		exist (including any negative ``start``).
	"""
	fields = bunchstr.split('|')
	# Negative indexes must not wrap around to the end of the list.
	if 0 <= start < len(fields):
		return fields[start]
	return ""

def findnthbar2(bunchstr, n):
	"""Return one field of a '#'-separated string, decoded by position.

	Args:
		bunchstr: the '#'-delimited text.
		n: index of the wanted field.

	Returns:
		"" when ``n`` is past the last field. Otherwise:
		  * field 1 -> int, unless the field is empty (then the empty text);
		  * field 2 -> int;
		  * field 3 -> "+" when the field parses to 1, "-" for anything else;
		  * any other field -> the raw text.

	Raises:
		ValueError: if field 2 or 3 (or a non-empty field 1) is not an integer.
	"""
	fields = bunchstr.split("#")
	if n >= len(fields):
		return ""

	value = fields[n]
	if n == 1 and value:
		return int(value)
	if n == 2:
		return int(value)
	if n == 3:
		# 1 marks the positive strand, everything else the negative one.
		return "+" if int(value) == 1 else "-"
	return value

def findORFfrom(bunchstr):
	"""Return everything after the sixth '|' in ``bunchstr``.

	Later '|' characters are kept as part of the result. When the text
	contains fewer than six '|' characters the result is "".
	"""
	# Splitting at most six times leaves the whole tail in the 7th piece.
	pieces = bunchstr.split('|', 6)
	if len(pieces) < 7:
		return ""
	return pieces[6]


try:
	# Python 2: text decoded from JSON arrives as ``unicode``.
	_TEXT_TYPE = unicode
except NameError:
	# Python 3: ``str`` is already the native text type; nothing to encode.
	_TEXT_TYPE = None


def convert(input):
	"""Recursively turn decoded JSON text into native strings.

	Dicts and lists are rebuilt with every key, value and element converted.
	Under Python 2, ``unicode`` objects are encoded to UTF-8 byte strings;
	under Python 3 strings are returned untouched. Any other value is
	returned as is.

	Args:
		input: a dict, list, string or scalar (name kept for compatibility
			with existing keyword callers even though it shadows a builtin).

	Returns:
		A structure of the same shape with its strings converted.
	"""
	if isinstance(input, dict):
		# items() exists on both Python 2 and 3, unlike iteritems().
		return dict((convert(key), convert(value)) for key, value in input.items())
	if isinstance(input, list):
		return [convert(element) for element in input]
	if _TEXT_TYPE is not None and isinstance(input, _TEXT_TYPE):
		return input.encode('utf-8')
	return input


def checkKeyExisted(key, my_dict):
	"""Tell whether ``my_dict`` holds a non-None value under ``key``.

	Returns:
		True when the key is present and its value is not None;
		False when the lookup raises KeyError or the value is None.
	"""
	try:
		value = my_dict[key]
	except KeyError:
		return False
	return value is not None


def _ascii_text(value):
	"""Return ``value`` with every non-ASCII character replaced by '?'."""
	return value.encode('ascii', 'replace').decode('ascii')


def _orf_location_columns(item, orf):
	"""Split an ORF label into the five leading output columns.

	Args:
		item: the ORF label used as key in the RGI JSON.
		orf: "genemark" selects '|'-delimited labels; any other value selects
			'#'-delimited labels.

	Returns:
		A ``(columns, parsed)`` pair. ``columns`` is
		``[ORF_ID, CONTIG, START, STOP, ORIENTATION]``. ``parsed`` is False
		when the label carries no coordinates, in which case the label itself
		becomes ORF_ID and the other four columns are empty.
	"""
	if orf == "genemark":
		# Protein runs have no '|' fields and no start/stop/strand.
		if findnthbar(item, 4) == "":
			return [item, "", "", "", ""], False
		return [
			findnthbar(item, 0),
			findORFfrom(item),
			int(findnthbar(item, 4)) - 1,
			int(findnthbar(item, 5)) - 1,
			findnthbar(item, 3),
		], True

	if findnthbar2(item, 1) == "":
		return [item, "", "", "", ""], False
	return [
		findnthbar2(item, 0),
		findnthbar2(item, 4).strip(" "),
		int(findnthbar2(item, 1)) - 1,
		int(findnthbar2(item, 2)) - 1,
		findnthbar2(item, 3),
	], True


def printCSV(resultfile,ofile,orf,verbose):
	"""Convert an RGI JSON result file into a tab-delimited summary.

	The JSON document is read as a mapping ``ORF label -> {hit id -> hit}``;
	the top-level keys "_metadata" and "data_type" are skipped. One row is
	written per ORF that has at least one hit, to
	``<working_directory>/<ofile>.txt``.

	The "best" hit of an ORF is the first hit seen, replaced by a later hit
	only when that hit has BOTH a higher bit-score and a higher percent
	identity.

	Args:
		resultfile: path of the JSON file to read.
		ofile: output file name without the ".txt" extension.
		orf: label format selector, see ``_orf_location_columns``.
		verbose: accepted for interface compatibility; not used here.

	Raises:
		SystemExit: when ``resultfile`` is missing or is not valid JSON
			(a message is written to stderr first).
	"""
	# NOTE(review): both error exits keep the historical exit status (0);
	# consider a non-zero status if no caller depends on it.
	if not os.path.isfile(resultfile):
		sys.stderr.write("convertJsonToTSV missing input JSON file.\n")
		sys.exit()

	try:
		with open(resultfile, 'r') as f:
			data = json.load(f)
	except ValueError:
		sys.stderr.write("convertJsonToTSV expects a file contains a VALID JSON string.\n")
		sys.exit()

	with open(working_directory+"/"+ofile+".txt", "w") as af:
		writer = csv.writer(af, delimiter='\t', dialect='excel')
		writer.writerow(["ORF_ID", "CONTIG", "START", "STOP", "ORIENTATION", "CUT_OFF", "PASS_EVALUE", "Best_Hit_evalue", "Best_Hit_ARO", "Best_Identities", "ARO", "ARO_name", "Model_type", "SNP", "Best_Hit_ARO_category", "ARO_category", "PASS_bitscore", "Best_Hit_bitscore", "bit_score","Predicted_DNA","Predicted_Protein","CARD_Protein_Sequence","LABEL","ID","Model_id"])

		for item in data:
			if item in ("_metadata", "data_type"):
				continue

			hits = data[item]
			geneID = item

			# State of the best hit found so far for this ORF.
			startCompare = False
			minevalue = 0.0
			maxscore = 0.0
			maxpercent = 0.0
			minARO = ""
			topModel = ""
			hitID = ""
			SequenceFromBroadStreet = ""
			predictedProtein = ""
			predictedDNA = ""
			bestAROcategorydict = {}

			# Per-hit values accumulated over every hit of this ORF.
			AROlist = []
			AROnameList = []
			bitScoreList = []
			AROcatList = []
			snpList = []
			cutoffList = []
			typeList = []
			identityList = []

			for it in hits:
				hit = hits[it]

				has_category = checkKeyExisted("ARO_category", hit)
				cgList = []
				if has_category:
					for aroctkey in hit["ARO_category"]:
						cgList.append(_ascii_text(hit["ARO_category"][aroctkey]["category_aro_name"]))

				# Model type 40293 carries a SNP record, 40292 never does;
				# other model types contribute nothing to the SNP column.
				if hit["model_type_id"] == 40293:
					snp = hit["SNP"]
					snpList.append(convert(snp["original"] + str(snp["position"]) + snp["change"]))
				elif hit["model_type_id"] == 40292:
					snpList.append("n/a")

				AROlist.append(convert(hit["ARO_accession"]))
				AROnameList.append(convert(hit["ARO_name"]))
				bitScoreList.append(hit["bit-score"])
				# The row reports the cut-offs of the last hit iterated.
				pass_evalue = str(hit["pass_evalue"]).split("|")[0]
				pass_bitscore = "n/a"
				AROcatList.append(cgList)
				typeList.append(convert(hit["model_type"]))
				cutoffList.append(convert(hit["type_match"]))
				identity = float(hit["perc_identity"])
				identityList.append(identity)

				# The first hit always becomes the best hit; a later one must
				# beat it on both bit-score and percent identity.
				if not startCompare or (maxscore < hit["bit-score"] and maxpercent < identity):
					startCompare = True
					minevalue = hit["evalue"]
					maxscore = hit["bit-score"]
					maxpercent = identity
					minARO = hit["ARO_name"]
					topModel = hit["model_id"]
					SequenceFromBroadStreet = hit["SequenceFromBroadStreet"]

					# Sequences and the HSP id keep their previous value when
					# the new best hit does not provide them.
					if "orf_prot_sequence" in hit:
						predictedProtein = hit["orf_prot_sequence"]
					if "orf_dna_sequence" in hit:
						predictedDNA = hit["orf_dna_sequence"]

					if has_category:
						bestAROcategorydict[str(minARO)+"|"+str(minevalue)] = cgList

					if "hsp_num:" in it:
						hitID = it

			# ORFs without any hit produce no row.
			if not typeList:
				continue

			if set(snpList) == set(['n/a']):
				snp_column = 'n/a'
			else:
				snp_column = ', '.join(snpList)

			AROsortedList = sorted(set(name for names in AROcatList for name in names))

			location, parsed = _orf_location_columns(item, orf)
			# The separators (and the identity reported for parsed genemark
			# labels) differ between label formats; they are kept exactly as
			# they have always been so existing consumers see the same output.
			if orf == "genemark":
				best_identity = max(identityList) if parsed else maxpercent
				name_sep = '; '
				type_sep = '; '
			else:
				best_identity = maxpercent
				name_sep = ', ' if parsed else '; '
				type_sep = ', '

			# A best hit without ARO categories has no dictionary entry;
			# fall back to an empty category list instead of raising KeyError.
			best_categories = bestAROcategorydict.get(str(minARO)+"|"+str(minevalue), [])

			writer.writerow(location + [
				', '.join(set(cutoffList)),
				pass_evalue,
				minevalue,
				minARO,
				best_identity,
				', '.join("ARO:"+x for x in AROlist),
				name_sep.join(set(AROnameList)),
				type_sep.join(set(typeList)),
				snp_column,
				'; '.join(best_categories),
				'; '.join(AROsortedList),
				pass_bitscore,
				maxscore,
				', '.join(map(str, bitScoreList)),
				predictedDNA,
				predictedProtein,
				SequenceFromBroadStreet,
				geneID,
				hitID,
				topModel,
			])

def manual():
	"""Print a help table describing every column of the output file.

	The table is written to standard output: two blank lines, a header line,
	one ``NAME \\t\\t\\tHELP`` line per column in the same order as the
	columns appear in the output file, then two blank lines.
	"""
	# A list of pairs (rather than a dict) keeps the rows in column order.
	columns = [
		("ORF_ID", "Open Reading Frame identifier (internal to RGI)"),
		("CONTIG", "Source Sequence"),
		("START", "Start co-ordinate of ORF"),
		("STOP", "End co-ordinate of ORF"),
		("ORIENTATION", "Strand of ORF"),
		("CUT_OFF", "RGI Detection Paradigm"),
		("PASS_EVALUE", "STRICT detection model Expectation value cut-off"),
		("Best_Hit_evalue", "Expectation value of match to top hit in CARD"),
		("Best_Hit_ARO", "ARO term of top hit in CARD"),
		("Best_Identities", "Percent identity of match to top hit in CARD"),
		("ARO", "ARO accession of top hit in CARD"),
		("ARO_name", "ARO term of top hit in CARD"),
		("Model_type", "CARD detection model type"),
		("SNP", "Observed mutation (if applicable)"),
		("Best_Hit_ARO_category", "top hit ARO Categorization"),
		("ARO_category", "ARO Categorization"),
		("PASS_bitscore", "STRICT detection model bitscore value cut-off"),
		("Best_Hit_bitscore", "Bit score of match to top hit in CARD"),
		("bit_score", "Bitscore of match to top hit in CARD"),
		("Predicted_DNA", "ORF predicted nucleotide sequence"),
		("Predicted_Protein", "ORF predicted protein sequence"),
		("CARD_Protein_Sequence", "Protein sequence of top hit in CARD"),
		("LABEL", "ORF label (internal to RGI)"),
		("ID", "HSP identifier (internal to RGI)"),
		("Model_id", "CARD detection model id"),
	]

	# Every print receives exactly one pre-formatted string so the output is
	# the same whether print is a statement or a function.
	print("\n")
	print("COLUMN \t\t\tHELP_MESSAGE")
	for name, message in columns:
		print(name + " \t\t\t" + message)
	print("\n")


class customAction(argparse.Action):
    """argparse action that prints the column help table and stops the program."""

    def __call__(self, parser, namespace, values, option_string=None):
        """Print the column manual, then exit through the parser.

        ``parser.exit()`` is used instead of the interactive-only ``exit()``
        helper, which is only present when the ``site`` module is loaded.
        """
        manual()
        parser.exit()

def main(args):
	"""Convert the JSON file named in ``args`` to a tab-delimited file.

	Args:
		args: parsed command-line namespace providing ``afile`` (input path,
			optionally ending in ".gz"), ``output`` (output base name) and
			``verbose``.

	A ".gz" input is first passed to ``rgi.decompress``. When no usable input
	file is available a "Missing file" line is printed instead of converting.
	``rgi.removeTemp()`` is called at the end in either case.
	"""
	afile = args.afile
	ofile = args.output
	# The ORF label format is fixed; there is no command-line option for it.
	orf = "prodigal"
	verbose = args.verbose.lower()

	# afile is None when -i was not given; guard before calling str methods.
	if afile and afile.endswith('.gz'):
		afile = rgi.decompress(afile,'gz',working_directory)

	if afile and os.path.isfile(afile):
		printCSV(afile,ofile,orf,verbose)
	else:
		# Two spaces after the colon reproduce the historical output exactly.
		print("Missing file:  %s" % (afile,))
	rgi.removeTemp()

def run():
	"""Parse the command line and run the JSON to tab-delimited conversion.

	``--headers`` prints the column help and exits during parsing, so it can
	be used without ``-i``; every other invocation must supply ``-i``.
	"""
	parser = argparse.ArgumentParser(description='Convert RGI JSON file to Tab-delimited file')
	# Required: without an input path there is nothing to convert.
	parser.add_argument('-i','--afile', required=True, help='must be a json file generated from RGI in JSON or gzip format e.g out.json, out.json.gz')
	parser.add_argument('-o', '--out_file',  dest="output", default="dataSummary", help="Output Tab-delimited file (default=dataSummary)")
	parser.add_argument('-v', '--verbose', dest="verbose", default="OFF", help = "include help menu. Options are OFF or ON  (default = OFF for no help)")
	# nargs=0: the flag takes no value, it only triggers customAction.
	parser.add_argument('--headers', dest="headers", action=customAction, nargs=0, help="print a description of each output column and exit")
	args = parser.parse_args()
	main(args)

# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
	run()