LiuFan
/
PrivacyScanData


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
							#import sys
import argparse

parser = argparse.ArgumentParser( prog='find_unique_end',description="find the either 5' or 3' unique seqs", epilog='python find_unique_end.py -i inputfile -o outputfile')
parser.add_argument ('-i','--input',help='Input File Name', default="./data/mouse_VH.txt")
#parser.add_argument('-5','--head', help="trim 5' region", action='store_true')
#parser.add_argument('-o','--output',help='Output File Name',required="true")
parser.add_argument('-3','--direction', help="trim 3' region", action='store_false', default=True)
parser.add_argument('-n','--primerlen', type=int, help='length of nucleotide', default='22')
parser.print_help()
args=parser.parse_args()

##show values##
#print (parser.print_help())
print ("input file: %s" % args.input)
#print ("output file: %s" % args.output)
print ("start with 5prime?")
print (args.direction)
print ("primer length: %s" % args.primerlen)

def truncate(oligo,is5prime,length):
	if is5prime == True:
		#print oligo
		return oligo[:length-1]
	else:
		return oligo[length+1:]

####parse the input file, generate key value pair of each sequences
def read_dnaFastas(fastaFile):
	Infile1 = open(fastaFile, 'r')
	germName=''
	germSeq=''
	allDict={}
	nucleotides=set('ATGCatgcNn')
	for line in Infile1:
		line = line.strip().strip('\n')
		if line and line.startswith(">"):
			line=line.lstrip('>')
			#process previous entry
			allDict[germName]= germSeq
			#begin a new entry
			germName=line
			germSeq=''
		elif line =='':  #/check whether it is the last line
				allDict[germName]= germSeq
		elif  set(line) <= nucleotides and len(line)>10: 
			germSeq = germSeq + line
	Infile1.close()
	return allDict;

########## prepare output files ###############################
#Infilename1 = args.input
#Infile1 = open(Infilename1, 'r')
#Outfilename1= args.output
	
Outfilename1=args.input.rstrip(".txt")+"_fulllength_noN"+str(args.primerlen)+".txt"
Outfile1 = open(Outfilename1, 'w')

if args.direction == True:
	Outfilename2=args.input.rstrip('.txt')+"Sense_primerlen_"+str(args.primerlen)
	Outfilename3=args.input.rstrip('.txt')+"Unique_5prime_lenth_"+str(args.primerlen)
else:
	Outfilename2=args.input.rstrip('.txt')+"AntiSense_primerlen_"+str(args.primerlen)
	Outfilename3=args.input('.txt')+"Unique_3prime_lenth_"+str(args.primerlen)
Outfile2 = open(Outfilename2, 'w')
Outfile3 = open(Outfilename3, 'w')

############# main ###############
allFasta={}
allFasta = read_dnaFastas(args.input)
trunFasta={}
unique_trunFasta={}
unique_Oligo=[]
all_Oligo=[]

for key in allFasta:
	Outfile1.write('>'+str(key) + "\t" + allFasta[key] + '\n')
	if not allFasta[key].upper().startswith('N'):
		
		currentOligo= truncate(allFasta[key],args.direction,args.primerlen) 
		# generate trimmed fasta file and dictinary
		all_Oligo.append(currentOligo)
		trunFasta[key] = currentOligo 
		Outfile2.write(str(key) + "\t" + trunFasta[key] + '\n')
print (str(len(allFasta))+' sequences have been writen into ' + Outfilename1 + '\n')
print (str(len(trunFasta)) + ' truncated sequences have been writen into ' + Outfilename2 + '\n')

unique_Oligo=set(all_Oligo)

for seq in unique_Oligo:
	count=0
	for key in trunFasta:
		if trunFasta[key] == seq:
			count = count +1
		unique_trunFasta[seq]= count;
	Outfile3.write(">"+str(count) +'_counts_'+ seq+"\n" + seq + '\n')


print (str(len(unique_Oligo)) +" unique oligos whose lenth is "+str(args.primerlen)+ 'have been writen into ' + Outfilename3 + '\n')
Outfile1.close()
Outfile2.close()
Outfile3.close()