123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
- #import sys
- import argparse
- parser = argparse.ArgumentParser( prog='find_unique_end',description="find the either 5' or 3' unique seqs", epilog='python find_unique_end.py -i inputfile -o outputfile')
- parser.add_argument ('-i','--input',help='Input File Name', default="./data/mouse_VH.txt")
- #parser.add_argument('-5','--head', help="trim 5' region", action='store_true')
- #parser.add_argument('-o','--output',help='Output File Name',required="true")
- parser.add_argument('-3','--direction', help="trim 3' region", action='store_false', default=True)
- parser.add_argument('-n','--primerlen', type=int, help='length of nucleotide', default='22')
- parser.print_help()
- args=parser.parse_args()
- ##show values##
- #print (parser.print_help())
- print ("input file: %s" % args.input)
- #print ("output file: %s" % args.output)
- print ("start with 5prime?")
- print (args.direction)
- print ("primer length: %s" % args.primerlen)
- def truncate(oligo,is5prime,length):
- if is5prime == True:
- #print oligo
- return oligo[:length-1]
- else:
- return oligo[length+1:]
- ####parse the input file, generate key value pair of each sequences
- def read_dnaFastas(fastaFile):
- Infile1 = open(fastaFile, 'r')
- germName=''
- germSeq=''
- allDict={}
- nucleotides=set('ATGCatgcNn')
- for line in Infile1:
- line = line.strip().strip('\n')
- if line and line.startswith(">"):
- line=line.lstrip('>')
- #process previous entry
- allDict[germName]= germSeq
- #begin a new entry
- germName=line
- germSeq=''
- elif line =='': #/check whether it is the last line
- allDict[germName]= germSeq
- elif set(line) <= nucleotides and len(line)>10:
- germSeq = germSeq + line
- Infile1.close()
- return allDict;
- ########## prepare output files ###############################
- #Infilename1 = args.input
- #Infile1 = open(Infilename1, 'r')
- #Outfilename1= args.output
-
- Outfilename1=args.input.rstrip(".txt")+"_fulllength_noN"+str(args.primerlen)+".txt"
- Outfile1 = open(Outfilename1, 'w')
- if args.direction == True:
- Outfilename2=args.input.rstrip('.txt')+"Sense_primerlen_"+str(args.primerlen)
- Outfilename3=args.input.rstrip('.txt')+"Unique_5prime_lenth_"+str(args.primerlen)
- else:
- Outfilename2=args.input.rstrip('.txt')+"AntiSense_primerlen_"+str(args.primerlen)
- Outfilename3=args.input('.txt')+"Unique_3prime_lenth_"+str(args.primerlen)
- Outfile2 = open(Outfilename2, 'w')
- Outfile3 = open(Outfilename3, 'w')
- ############# main ###############
- allFasta={}
- allFasta = read_dnaFastas(args.input)
- trunFasta={}
- unique_trunFasta={}
- unique_Oligo=[]
- all_Oligo=[]
- for key in allFasta:
- Outfile1.write('>'+str(key) + "\t" + allFasta[key] + '\n')
- if not allFasta[key].upper().startswith('N'):
-
- currentOligo= truncate(allFasta[key],args.direction,args.primerlen)
- # generate trimmed fasta file and dictinary
- all_Oligo.append(currentOligo)
- trunFasta[key] = currentOligo
- Outfile2.write(str(key) + "\t" + trunFasta[key] + '\n')
- print (str(len(allFasta))+' sequences have been writen into ' + Outfilename1 + '\n')
- print (str(len(trunFasta)) + ' truncated sequences have been writen into ' + Outfilename2 + '\n')
- unique_Oligo=set(all_Oligo)
- for seq in unique_Oligo:
- count=0
- for key in trunFasta:
- if trunFasta[key] == seq:
- count = count +1
- unique_trunFasta[seq]= count;
- Outfile3.write(">"+str(count) +'_counts_'+ seq+"\n" + seq + '\n')
- print (str(len(unique_Oligo)) +" unique oligos whose lenth is "+str(args.primerlen)+ 'have been writen into ' + Outfilename3 + '\n')
- Outfile1.close()
- Outfile2.close()
- Outfile3.close()
|