find_unique_end4.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. #import sys
  2. import argparse
  3. parser = argparse.ArgumentParser( prog='find_unique_end',description="find the either 5' or 3' unique seqs", epilog='python find_unique_end.py -i inputfile -o outputfile')
  4. parser.add_argument ('-i','--input',help='Input File Name', default="./data/mouse_VH.txt")
  5. #parser.add_argument('-5','--head', help="trim 5' region", action='store_true')
  6. #parser.add_argument('-o','--output',help='Output File Name',required="true")
  7. parser.add_argument('-3','--direction', help="trim 3' region", action='store_false', default=True)
  8. parser.add_argument('-n','--primerlen', type=int, help='length of nucleotide', default='22')
  9. parser.print_help()
  10. args=parser.parse_args()
  11. ##show values##
  12. #print (parser.print_help())
  13. print ("input file: %s" % args.input)
  14. #print ("output file: %s" % args.output)
  15. print ("start with 5prime?")
  16. print (args.direction)
  17. print ("primer length: %s" % args.primerlen)
  18. def truncate(oligo,is5prime,length):
  19. if is5prime == True:
  20. #print oligo
  21. return oligo[:length-1]
  22. else:
  23. return oligo[length+1:]
  24. ####parse the input file, generate key value pair of each sequences
  25. def read_dnaFastas(fastaFile):
  26. Infile1 = open(fastaFile, 'r')
  27. germName=''
  28. germSeq=''
  29. allDict={}
  30. nucleotides=set('ATGCatgcNn')
  31. for line in Infile1:
  32. line = line.strip().strip('\n')
  33. if line and line.startswith(">"):
  34. line=line.lstrip('>')
  35. #process previous entry
  36. allDict[germName]= germSeq
  37. #begin a new entry
  38. germName=line
  39. germSeq=''
  40. elif line =='': #/check whether it is the last line
  41. allDict[germName]= germSeq
  42. elif set(line) <= nucleotides and len(line)>10:
  43. germSeq = germSeq + line
  44. Infile1.close()
  45. return allDict;
  46. ########## prepare output files ###############################
  47. #Infilename1 = args.input
  48. #Infile1 = open(Infilename1, 'r')
  49. #Outfilename1= args.output
  50. Outfilename1=args.input.rstrip(".txt")+"_fulllength_noN"+str(args.primerlen)+".txt"
  51. Outfile1 = open(Outfilename1, 'w')
  52. if args.direction == True:
  53. Outfilename2=args.input.rstrip('.txt')+"Sense_primerlen_"+str(args.primerlen)
  54. Outfilename3=args.input.rstrip('.txt')+"Unique_5prime_lenth_"+str(args.primerlen)
  55. else:
  56. Outfilename2=args.input.rstrip('.txt')+"AntiSense_primerlen_"+str(args.primerlen)
  57. Outfilename3=args.input('.txt')+"Unique_3prime_lenth_"+str(args.primerlen)
  58. Outfile2 = open(Outfilename2, 'w')
  59. Outfile3 = open(Outfilename3, 'w')
  60. ############# main ###############
  61. allFasta={}
  62. allFasta = read_dnaFastas(args.input)
  63. trunFasta={}
  64. unique_trunFasta={}
  65. unique_Oligo=[]
  66. all_Oligo=[]
  67. for key in allFasta:
  68. Outfile1.write('>'+str(key) + "\t" + allFasta[key] + '\n')
  69. if not allFasta[key].upper().startswith('N'):
  70. currentOligo= truncate(allFasta[key],args.direction,args.primerlen)
  71. # generate trimmed fasta file and dictinary
  72. all_Oligo.append(currentOligo)
  73. trunFasta[key] = currentOligo
  74. Outfile2.write(str(key) + "\t" + trunFasta[key] + '\n')
  75. print (str(len(allFasta))+' sequences have been writen into ' + Outfilename1 + '\n')
  76. print (str(len(trunFasta)) + ' truncated sequences have been writen into ' + Outfilename2 + '\n')
  77. unique_Oligo=set(all_Oligo)
  78. for seq in unique_Oligo:
  79. count=0
  80. for key in trunFasta:
  81. if trunFasta[key] == seq:
  82. count = count +1
  83. unique_trunFasta[seq]= count;
  84. Outfile3.write(">"+str(count) +'_counts_'+ seq+"\n" + seq + '\n')
  85. print (str(len(unique_Oligo)) +" unique oligos whose lenth is "+str(args.primerlen)+ 'have been writen into ' + Outfilename3 + '\n')
  86. Outfile1.close()
  87. Outfile2.close()
  88. Outfile3.close()