# Tailing_Pipeline_v13.py
  1. #!/usr/local/bin/python3
  2. # coding=utf-8
  3. #Created by Parth Patel, DBI @ University of Delaware, Newark, Delaware 19717
  4. #Date created: 10/08/2014 Modified on 11/2/2015 Line number 350,297
  5. import os
  6. import sys
  7. if sys.version < '2.6':
  8. print ('You are using a version of Python that this program does not support. Please update to the latest version!')
  9. sys.exit(1)
  10. import subprocess, multiprocessing
  11. from multiprocessing import Process, Queue, Pool
  12. import logging
  13. import re #pattern search https://docs.python.org/2/howto/regex.html
  14. #import Bio #importing biopython
  15. #from Bio import SeqIO #import Bio-python
  16. #plot specific packages
  17. import matplotlib
  18. matplotlib.use('Agg')
  19. import matplotlib.pyplot as plt
  20. #from matplotlib import rc
  21. #from pylab import *
  22. import datetime,time,timeit #import Date and Time
  23. import argparse
  24. from PyPDF2 import PdfFileReader, PdfFileMerger #working with PDF in python
  25. import random
  26. ###################################################################################################################
  27. # STEP-1 #
  28. #Given a tag, finds the longest subsequence in it which aligns to a reference genome (corresponding BOWTIE INDEX) #
  29. #Given a file containing short sequences, finds the longest aligned subsequence from the 5' end #
  30. ###################################################################################################################
  31. nproc = 'Y'
  32. nthread = 6
  33. global DIR_NAME
  34. Date=str(datetime.date.today())#get Today's date
  35. Time=str(datetime.datetime.now().time())#get Current time )#get Current time
  36. DIR_NAME="5pLCS_results"+"_"+Date+"_"+Time
  37. global OUTPUT_DIR
  38. #OUTPUT_DIR_NAME="Tailing_output_"+Date+"_"+Time
  39. def RunBowtie(InputFile, BOWTIE_INDEX, Aligned, Unaligned, AlignmentOutput, pas):
  40. command = "bowtie --concise " +\
  41. "-v 0 " +\
  42. "--threads %s " % nthread +\
  43. "%s " % BOWTIE_INDEX +\
  44. "-r %s " % InputFile +\
  45. "%s " % AlignmentOutput +\
  46. "--al %s " % Aligned +\
  47. "--un %s " % Unaligned
  48. if os.system(command) == 0:
  49. print ("Bowtie Completed Successfully - Pass %d " % pas)
def pLCS_finder(inputfile):
    # STEP-1 worker.  For every tag in a tag-count file, iteratively realign
    # with bowtie, trimming one 3' base per pass, until the longest aligning
    # 5' prefix ("head") is found; the trimmed remainder is the 3' "tail".
    # Writes "<name>_<date>.txt" plus a ".size_distribution" log into
    # DIR_NAME and returns that directory name.
    print ("Working on "+inputfile+"\n")
    InputFile=inputfile
    Date=str(datetime.date.today())#get Today's date
    OutputFile=os.path.splitext(inputfile)[0]+"_"+Date+".txt" #get the input filename without the extension and append Today's date
    bowtie_index=bowtieindex  # module global, set by main()
    #open output file for writing
    IN= open(InputFile,'r')
    # Input validation
    # NOTE(review): this runs after open() above, so a missing input file would
    # already have raised IOError before this friendly message is reached.
    if not os.path.isfile(InputFile):
        print ("Cannot find specified input file")
        sys.exit(1)
    if not os.path.isfile(bowtie_index+'.1.ebwt'):
        print ("Cannot find specified bowtie index")
        sys.exit(1)
    abundances={}   # tag sequence -> abundance (read count)
    tail_length={}  # tag sequence -> tail length as a non-positive offset (0, -1, -2, ...)
    fh_in= IN.readline() ## Remove header
    for line in IN:
        fields = line.split("\t")
        seq, value = fields[0], int(fields[1])
        abundances[seq] = value
    LogFile = os.path.splitext(OutputFile)[0] + '.size_distribution'
    # Per-input temp files so parallel workers do not clobber each other.
    tempInput = 'tempInput' +InputFile+ '.txt'
    AlignmentOutput = 'Bowtie' +InputFile+ '.out'
    aligned = 'aligned_tags' +InputFile+ '.out'
    unaligned = 'unaligned_tags' +InputFile+ '.out'
    pas = 0  # pass counter: 0 on the untrimmed pass, then -1, -2, ...
    logging.basicConfig(filename=LogFile, level=logging.NOTSET, format="%(message)s")
    next_run = open(tempInput, 'w')
    next_run.write("\n".join(tag for tag in abundances.keys())+'\n')
    next_run.close()
    print ("Abundance is %d long and tail length is %d long" %(len(abundances), len(tail_length)))
    ###############################################################################################
    # Keep aligning until every tag has been assigned a tail length.
    while len(abundances) != len(tail_length):
        RunBowtie(tempInput, bowtie_index, aligned,unaligned, AlignmentOutput, pas)
        list_of_aligned = {}
        try:
            for tag in open(aligned):
                list_of_aligned[tag[:-1]] = 1 # Create an entry for each tag that is aligned. The value bears no significance
        except IOError:
            pass  # nothing aligned on this pass, so bowtie wrote no --al file
        for tag in abundances.keys():
            if tag not in tail_length:
                check_tag = tag
                if pas < 0:
                    check_tag = tag[:pas] # For the special case of pas = 0, the intended slicing to trim would not work. For every other pass, this statement trims.
                if check_tag in list_of_aligned:
                    tail_length[tag] = pas
        pas = int(pas)-1
        # Re-queue only the still-unaligned tags for the next pass;
        # [:-2] drops the trailing newline plus one more 3' base (the trim).
        next_run = open(tempInput, 'w')
        unal = open(unaligned)
        next_run.write("\n".join(tag[:-2] for tag in unal.readlines())+'\n')
        unal.close()
        next_run.close()
    #############################################################################################
    command = "rm %s %s %s %s" %(tempInput, AlignmentOutput, aligned, unaligned)
    if os.system(command) == 0:
        print ("Deleting temporary Files.........")
    taglength_count = {}          # tag length -> number of tags
    headlength_count = {}         # head length (len(tag)+tail_length) -> number of heads
    taglength_sum = {}            # tag length -> summed tag abundance
    headlength_sum = {}           # head length -> summed head abundance
    root_of_head_length_sum = {}  # tag length -> summed head abundance
    dirName=DIR_NAME
    if not os.path.exists(dirName): #create a directory for storing the results of STEP1
        os.makedirs(dirName)
    current_path= os.getcwd() #get the path of current directory
    new_path= os.path.join(current_path,dirName) #add newly created results' folder to an exisiting path
    os.chdir(new_path) #change the directory
    Output=open(OutputFile, 'w')
    Output.write("tag\tabundance\thead\thead_abundance\ttail\n")
    for tag in abundances.keys():
        Output.write("%s\t" % tag)
        Output.write("%d\t" % abundances[tag])
        if tail_length[tag] != 0:
            # tail_length[tag] is negative: tag[:tail_length[tag]] is the aligned
            # head, tag[len(tag)+tail_length[tag]:] the trailing tail bases.
            Output.write("%s\t" % tag[:tail_length[tag]] )
            head_abundance = abundances.get(tag[:tail_length[tag]], 0 )
            if head_abundance == 0:
                Output.write("0\t")
            else:
                Output.write("%d\t" % head_abundance)
            Output.write("%s\n" % tag[len(tag)+tail_length[tag]:] )
        else:
            # Untailed tag: the head columns repeat the tag itself, tail is "None".
            Output.write("%s\t" % tag)
            Output.write("%d\t" % abundances[tag])
            Output.write("None\n")
        # The three try/except clauses have to be maintained separately and as they are:
        try:
            taglength_count[ len(tag) ] = taglength_count[len(tag) ] + 1
            taglength_sum[ len(tag) ] = taglength_sum[ len(tag) ] + abundances[tag]
        except KeyError:
            taglength_count[ len(tag)] = 1
            taglength_sum[ len(tag) ] = abundances[tag]
        try:
            headlength_count[ len(tag) + tail_length[tag] ] = headlength_count[ len(tag) + tail_length[tag] ] + 1
        except:
            headlength_count[ len(tag) + tail_length[tag] ] = 1
        try:
            headlength_sum[ len(tag) + tail_length[tag] ] = headlength_sum[ len(tag) + tail_length[tag] ] + abundances.get(tag[:tail_length[tag]], 0 )
        except:
            headlength_sum[ len(tag) + tail_length[tag] ] = abundances.get(tag[:tail_length[tag]], 0 )
        try:
            root_of_head_length_sum[ len(tag) ] = root_of_head_length_sum[ len(tag) ] + abundances.get(tag[:tail_length[tag]], 0 )
        except KeyError:
            root_of_head_length_sum[ len(tag) ] = abundances.get(tag[:tail_length[tag]], 0 )
    Output.close()
    # Size-distribution table goes to the ".size_distribution" log file.
    logging.info( "(Size-Group)\t(Number of tags)\t(Sum of tags)\t(Number of heads)\t(Sum of heads)\t(Sum of heads with tag in size)\n" )
    for key in range( min( headlength_count.keys() ), max( taglength_count.keys() )+1 ):
        logging.info( "%d\t%d\t%d\t%d\t%d\t%d" %( key, taglength_count.get(key,0), taglength_sum.get(key,0), headlength_count.get(key,0), headlength_sum.get(key,0), root_of_head_length_sum.get(key,0) ) )
    logging.shutdown()
    os.chdir(current_path)#go back to main directory
    print("\n\npLCS_finder Done")
    return dirName
  177. ####################################################################################
  178. # STEP-2 For multiple libraries from the 1st step, merge 5GMC/Tail from above into one #
  179. ####################################################################################
  180. # USAGE = " python2.6 Merge_5pLCS.py -[Name of the results Directory from first step] -[output_file_name] "
  181. def merge_5plCS(DIR,outfile):
  182. seqdir = DIR #"results"#str(sys.argv[1]) #seqdir directory_with_chrom_seq
  183. outputfile = outfile#"step2_output_HEN1.txt"#str(sys.argv[2]) #output_file_name
  184. #open output file for writing
  185. out= open(outputfile,'w')
  186. #library id starting from 1
  187. lib_id=1
  188. for filename in os.listdir(seqdir): #list all the files in the current directory
  189. # print os.listdir(seqdir)
  190. #for filename in os.listdir(os.getcwd()): #list all the files in the current directory
  191. lib_name= os.path.splitext(filename)[0] #get the filename without the extension
  192. # print lib_name + "\n" + filename
  193. in_file = open(os.path.join(seqdir,filename),'r') # open file for reading
  194. f = in_file.readlines()
  195. firstLine = f.pop(0) #removes the first line or the header row
  196. for line in f:
  197. out.write(str(lib_id))
  198. out.write("\t")
  199. out.write(lib_name)
  200. out.write("\t")
  201. out.write(line)
  202. out.write("\n")
  203. in_file.close() #closing the inputfile to fetch the next one
  204. lib_id=lib_id+1 #update the library id
  205. # format_Output(input_file)
  206. print ("Second Step is DONE..Files Merged.")
  207. out.close() #closing the output file
  208. return outputfile
  209. ####################################################################################
  210. # STEP-3 For multiple libraries from the 1st step, merge 5GMC/Tail from above into one #
  211. ####################################################################################
  212. # USAGE = " python2.6 Tailing_analysis_DB_v6.py -[Name of the miRNA/siRNA file] -[Merged file from the second step] "
  213. def tailling_Analysis(miRNA_file,mergred_file):
  214. miRNA_file_name = miRNA_file#"new_miRNA_Test1.fa"#str(sys.argv[1]) #seqdir directory_with_chrom_seq
  215. merged_file_name = mergred_file#"step2_output_HEN1.txt"#str(sys.argv[2]) #output_file_name
  216. miRNA_NAME=[]
  217. miRNA_TAG=[]
  218. fh_in = open(miRNA_file_name,'r')
  219. miRNAData = fh_in.read().split('>')
  220. miRList = [] ## Store miRNA NAME and TAG as tuple
  221. for i in miRNAData[1:]:
  222. block = i.split('\n')
  223. ID = block[0].split(' ')[0]
  224. seq = block[1]
  225. miRList.append((ID,seq))
  226. # print (ID,seq)
  227. miRListS = sorted(miRList) #Sorted by miRNA NAME
  228. for i in miRListS:
  229. ID,SEQ=i
  230. miRNA_NAME.append(ID)
  231. miRNA_TAG.append(SEQ)
  232. ##convert all U'S into T's for miRNA_TAG
  233. for index in range(0,len(miRNA_TAG)):
  234. # print miRNA_TAG[index]+"->>"+ re.sub("U","T",str(miRNA_TAG[index]))
  235. miRNA_TAG[index]=re.sub("U","T",str(miRNA_TAG[index]))
  236. miRNA_TAG[index]=miRNA_TAG[index].upper() # create an upper case if miRNAs are not already in uppercase, Added on 11/2/2015
  237. # print (miRNA_TAG[index],miRNA_NAME[index])
  238. print ("miRNA laoding DONE!")
  239. # output_dirName=OUTPUT_DIR_NAME
  240. # if not os.path.exists(output_dirName): #create a directory for storing the results of STEP1
  241. # os.makedirs(output_dirName)
  242. # current_path= os.getcwd() #get the path of current directory
  243. merged_input=open(merged_file_name, 'r') # READ merged file from step 2
  244. # new_path= os.path.join(current_path,output_dirName) #add newly created results' folder to an exisiting path
  245. # os.chdir(new_path) #change the directory
  246. os.chdir(OUTPUT_DIR)
  247. out= open("Tail_truncated_"+miRNA_file_name+merged_file_name,'w')
  248. out.write("Lib_ID \t Lib_NAME \t Tag \t Tag_abundance \t Sub_tag \t Sub_tag_hits \t Sub_tag_abun \t Tail \t Tail length\n") # tail length added by Parth after the 1st review of this paper. 8/5/2015
  249. out2=open("Tail_summary_truncated_"+miRNA_file_name+merged_file_name,'w')
  250. #out2.write("miRNA_NAME \t miRNA_TAG \t miR_abundance \t Lib_ID \t Lib_NAME \t sum_tailed \t percent \n") didn't use percent because it's giving value more than 100% e.g. 466.66
  251. out2.write("miRNA \t miRNA sequence \t Abundance \t library # \t library \t Sum of abundances \t tailing ratio = Sum of abundances/Abundance \n")
  252. hash_miR= {}
  253. miR_abun= {}
  254. hash_count= {}
  255. hash_lib_id= {}
  256. hash_miR_rows=[] #Created this list which acts as a Arrays of Hashes
  257. for lines in merged_input: #reading each line from the file
  258. if not lines.strip(): # To remove empty line in your file (usually the last line)
  259. continue
  260. #print (lines)
  261. lib_id, lib_name, tag ,tag_abun, sub_tag, sub_tag_abun,tail = lines.split('\t',21)
  262. # 1 test1 CCCAGGTCCAGACATAGTAAGGATTGACAGACTGAGATCACTTTCTTGATTC 1 CCCAGGTCCAGAC 0 ATAGTAAGGATTGACAGACTGAGATCACTTTCTTGATTC
  263. # print lib_id, lib_name, tag ,tag_abun, sub_tag, sub_tag_abun,tail
  264. hash_lib_id [lib_id]=lib_name
  265. if (len(tag)>30):
  266. continue
  267. if (len(sub_tag)<10):
  268. continue
  269. #search for truncated miRNA
  270. for index in range(0,len(miRNA_NAME)):
  271. ID=lib_id+"_"+miRNA_NAME[index]
  272. if((re.match(str(miRNA_TAG[index]),sub_tag,re.IGNORECASE)) or (re.match(sub_tag,str(miRNA_TAG[index]),re.IGNORECASE))): # the pattern at the start of the string, returning a match object, or None if no match was found. re.IGNORECASE Added on 11/2/2015 to handle lower case tag count file.
  273. if (ID in hash_miR): #Append matched pattern to an exisiting miRNA entry in the hash(dictionary)
  274. hash_miR[ID].append([lib_id, lib_name, tag ,tag_abun, sub_tag, sub_tag_abun,tail])
  275. else: #if miRNA entry doesn't exist, create an empty entry in the hash and append the pattern
  276. hash_miR[ID] = []
  277. hash_miR[ID].append([lib_id, lib_name, tag ,tag_abun, sub_tag, sub_tag_abun,tail])
  278. hash_count[sub_tag]=+1
  279. if(miRNA_TAG[index]==tag):
  280. miR_abun[ID]=tag_abun
  281. print ("Tail matching Done!\n")
  282. for index in range(0,len(miRNA_NAME)):
  283. sortedKeys=sorted(hash_lib_id.keys())# sort the hash_lib_id on thier keys
  284. for key in sortedKeys:
  285. lib_id=key
  286. ID=lib_id+"_"+miRNA_NAME[index]
  287. # print sorted(miR_abun.keys())
  288. if(ID in miR_abun):
  289. out.write (">"+miRNA_NAME[index]+"_"+miRNA_TAG[index]+"_"+lib_id+"_"+ hash_lib_id[lib_id]+"_"+ miR_abun[ID]+"\n")
  290. else:
  291. out.write (">"+miRNA_NAME[index]+"_"+miRNA_TAG[index]+"_"+lib_id+"_"+ hash_lib_id[lib_id]+"_"+ "_" +"\n")
  292. # continue
  293. sum_tailed=0
  294. tail_ratio=0
  295. if(ID in hash_miR): #Check if we have the miRNA entry in the hash or not
  296. VALUES = hash_miR[ID]
  297. # print VALUES
  298. for hash_miR_rows in VALUES:
  299. # print hash_miR_rows
  300. lib_id, lib_name, tag ,tag_abun, sub_tag,sub_tag_abun,tail=hash_miR_rows
  301. sub_tag_hits=hash_count[sub_tag] # compute sub_tag_hits
  302. tail=tail.strip()# remove "\n" char from string
  303. hash_miR_rows= lib_id, lib_name, tag ,tag_abun, sub_tag,sub_tag_hits,sub_tag_abun,tail,len(tail) # tail length added by Parth after the 1st review of this paper. 8/5/2015
  304. #print (hash_miR_rows)
  305. delimiter='\t';
  306. output = delimiter.join(str(x) for x in hash_miR_rows)
  307. out.write("%s \n" % output) # tail length added by Parth after the 1st review of this paper. 8/5/2015
  308. sum_tailed=sum_tailed+int(tag_abun)
  309. if(ID in miR_abun):
  310. if (int(miR_abun[ID])<1):
  311. # percent = "N\A"
  312. tail_ratio="N\A"
  313. else:
  314. tail_ratio= (float(sum_tailed)/int(miR_abun[ID]))#Calculate the ratio- CORRECT WAY
  315. # tail_ratio= 100*(float(sum_tailed)/int(miR_abun[ID]))#Calculate the ratio -WRONG WAY
  316. # percent=round(tail_ratio,2) #Round number to 2 digits after decimal point
  317. tail_ratio=round(tail_ratio,2) #Round number to 2 digits after decimal point
  318. # out2.write("%s \t %s \t %d \t %s \t %s \t %d \t %f \n" % (miRNA_NAME[index] ,miRNA_TAG[index],int(miR_abun[ID]),lib_id,hash_lib_id[lib_id],sum_tailed,percent))
  319. out2.write("%s \t %s \t %d \t %s \t %s \t %d \t %f \n" % (miRNA_NAME[index] ,miRNA_TAG[index],int(miR_abun[ID]),lib_id,hash_lib_id[lib_id],sum_tailed,tail_ratio))
  320. else:
  321. out2.write("%s \t %s \t %s \t %s \t %s \t %d \t %s \n" % (miRNA_NAME[index] ,miRNA_TAG[index],"0",lib_id,hash_lib_id[lib_id],sum_tailed,"N\A"))
  322. # closing out put file
  323. out.close()
  324. out2.close()
  325. print ("Step 3 is DONE!\n")
  326. return ("Tail_truncated_"+miRNA_file_name+merged_file_name)
  327. ####################################################################################
  328. # STEP- 4 Format output file from the 3rd step #
  329. ####################################################################################
  330. # USAGE = " python2.6 Format_Taling_v2.py -[Name of the outputfile from step3] "
  331. #>ath-miR156a_TGACAGAAGAGAGTGAGCAC_10_hen1_8_660
  332. #10 hen1_8 TGACAGAAGAGAGTGACCACA 1 TGACAGAAGAGAGTGA 216 0 CCACA
  333. #10 hen1_8 TGACAGAAGAGAGTGAGCACTTTT 16 TGACAGAAGAGAGTGAGCAC 726 660 TTTT
  334. def format_Output(input_file):
  335. inputfile = input_file #"Tail_truncated_new_miRNA_Test1.fastep2_output_HEN1.txt"#str(sys.argv[1]) #input_file_name
  336. outputfile="FORTMATTED_STEP4_OUTPUT.txt"
  337. out= open(outputfile,'w')
  338. #open output file for writing
  339. IN = open(inputfile,'r')
  340. #defining hash variable for miRNA
  341. hash_miRNA={}
  342. hash_lib={}
  343. hash_abun={}
  344. miRNA_name={}
  345. lib_id=[]
  346. LINES = IN.readlines()
  347. firstLine = LINES.pop(0) #removes the first line or the header row
  348. print (firstLine)
  349. for lines in LINES: #reading each line from the file
  350. if not lines.strip():# To remove empty line in your file (usually the last line)
  351. continue
  352. if(re.search('\>',lines)):
  353. lines=re.sub('\>',"",lines) # remove '>' from the line
  354. splitLine = re.split('_',lines,maxsplit=3) #split first 3 item with "_" delimiter
  355. name= splitLine[0] #storing miRNA name
  356. miRNA_seq=splitLine[1] #storing miRNA sequence
  357. lib_id = splitLine[2] #storing libary id
  358. # print name,miRNA_seq,lib_id
  359. hash_miRNA[name]=miRNA_seq
  360. continue
  361. # lines="10 hen1_8 TGACAGAAGAGAGTGAGCACTTTT 16 TGACAGAAGAGAGTGAGCAC 726 660 TTTT"
  362. # lines="1 hen1-1-rep1 TCGGACCAGGCTTCACTTTTTT 4 TCGGACCAGGCTTCA 1 46 CTTTTTT"
  363. lib_id, lib_name,sRNA_seq,sRNA_abun,sub_tag,sub_hit,sub_abun,tail,tail_len = lines.split('\t',9) # tail length added by Parth after the 1st review of this paper. 8/5/2015
  364. # OR lib_id,lib_name,sRNA_seq,sRNA_abun,sub_tag,sub_hit,sub_abun,tail = re.split('\t',lines,maxsplit=8)
  365. # print lib_id, lib_name,sRNA_seq,sRNA_abun,sub_tag,sub_hit,sub_abun,tail
  366. # miRNA_name[name]=sRNA_seq,sub_tag,miRNA_seq,tail
  367. if (name in miRNA_name):
  368. miRNA_name[name].append([sRNA_seq,sub_tag,miRNA_seq,tail,tail_len])#Append matched pattern to an exisiting miRNA entry in the hash(dictionary) # tail length added by Parth after the 1st review of this paper. 8/5/2015
  369. else:
  370. miRNA_name[name]=[]
  371. miRNA_name[name].append([sRNA_seq,sub_tag,miRNA_seq,tail,tail_len])#if miRNA entry doesn't exist, create an empty entry in the hash and append the pattern # tail length added by Parth after the 1st review of this paper. 8/5/2015
  372. # print miRNA_name[name]
  373. # print lib_name,sRNA_seq,miRNA_name
  374. hash_lib[lib_name]=lib_name
  375. # print hash_lib [lib_name]
  376. # print hash_lib[lib_name]
  377. hash_abun[lib_name,sRNA_seq]=sRNA_abun
  378. # print hash_abun[lib_name,sRNA_seq]
  379. sortedKeys=sorted(hash_miRNA.keys()) # sort the hash_miRNA on thier keys
  380. for key in sortedKeys:
  381. hash_sRNA={}
  382. # print ">"+key+"_"+hash_miRNA[key]+"\n"
  383. out.write(">"+key+"_"+hash_miRNA[key]+"\n")
  384. # print "Complete_Sequence \t Sub_tag \t miRNA_seq \t Tail"
  385. out.write("Complete_Sequence \t"+" Sub_tag \t"+" miRNA_seq \t"+"Tail \t"+"Tail length")
  386. sortedKeys_1=sorted(hash_lib.keys())# sort the hash_lib on thier keys
  387. for lib in sortedKeys_1:
  388. out.write("\t"+lib)
  389. out.write("\n")
  390. if(key in miRNA_name):
  391. VALUES=miRNA_name[key]
  392. for miRNA_rows in VALUES:
  393. sRNA_seq,sub_tag,miRNA_seq,tail,tail_len=miRNA_rows # tail length added by Parth after the 1st review of this paper. 8/5/2015
  394. delimiter='\t';
  395. output = delimiter.join(str(x) for x in miRNA_rows)
  396. # print output
  397. count=0
  398. for lib_entry in sortedKeys_1:
  399. # count=+1
  400. # print "count ="+str(count)+"\n"
  401. if((lib_entry,sRNA_seq) in hash_abun):
  402. if(int(hash_abun[lib_entry,sRNA_seq])<1):
  403. hash_abun[lib_entry,sRNA_seq]=0
  404. output=output.replace("\n","")
  405. output=output+"\t"+hash_abun[lib_entry,sRNA_seq] #Appending sRNA_seq aundance based on lib_ID
  406. print (output)
  407. else:
  408. output=output.replace("\n","")
  409. output=output+"\t"+"0" #Appending sRNA_seq aundance = 0 based on IF[lib_entry,sRNA_seq] DOES NOT EXIST in hash_sRNA
  410. hash_sRNA[sRNA_seq]=output
  411. for sRNA_keys in (hash_sRNA.keys()):
  412. out.write(hash_sRNA[sRNA_keys]+"\n")
  413. # output=""
  414. out.write(">EOF_NNNNNNNNNN\n")
  415. #closing files
  416. IN.close()
  417. out.close()
  418. print ("Step-4 is DONE! \n")
  419. return (outputfile)
  420. ####################################################################################
  421. # STEP-5 Generate plots from the 4th step #
  422. ####################################################################################
def genrate_Plots(inputfile):
    # STEP-5: read the formatted step-4 table and, for each miRNA section,
    # draw a grid of truncation-vs-tailing bubble plots (one subplot per
    # library), saving PNG+PDF per miRNA, then merge all PDFs in the current
    # directory into merged_full.pdf.
    ######################### User input #############################
    #input file
    input_file = inputfile
    #only U tail? (1=yes, 0= no)
    OnlyU = 1
    #exclude canonical miRNA? (1=yes, 0= no)
    exclude_miR = 0
    # Define number of Columns and Rows for each figure
    Num_Column = 4
    Num_Row = 7
    Total_lib = 0
    # Calculation Range, default 10
    Cal_range = 10
    # plotting range for truncation and tailing (e.g. miR163 need larger range because it's 24nt)
    Plot_range = 9
    #use n to initiate drawfigure
    n = 1
    # Amplification Factor (scales bubble sizes after normalisation)
    AF = 1000
    # figure size
    Figure_size = 5
    # Draw one multi-panel figure for a single miRNA.
    # Reads Num_libs from the enclosing scope (set when the "Complete" row is parsed).
    def drawfigure(miRname, miRsize, lib_name, miR, miRsum):
        fig = plt.figure(figsize=(Figure_size * Num_Column, Figure_size * Num_Row))
        fig.suptitle(r'%s, size %s-nt'%(miRname,miRsize),x=0.30,y=0.925,color='k',fontsize=18, ha='right') #x/y added to center the suptitle
        fig.subplots_adjust(left=0.125, right=1, bottom=0, top=0.9, wspace=0.4, hspace=.5)# add white space between subplots
        # Shared axis cosmetics: diagonal guide lines, reversed x axis, fixed ranges.
        def set_properties(ax):
            for i in range(Plot_range + 1):
                ax.plot((Plot_range,-1+i),(Plot_range-i,-1),'#AFC7C7',linewidth=1,zorder=2)
                ax.plot((Plot_range-i,-1),(Plot_range,-1+i),'#AFC7C7',linewidth=1,zorder=2)
            ax.set_yscale('linear')
            ax.set_xscale('linear')
            plt.xlabel('Length of Truncation', fontsize=16)
            plt.ylabel('Length of Tailing', fontsize=16)
            ax.set_xlim(-1, Plot_range)
            ax.set_ylim(-1, Plot_range)
            ax.set_xlim(ax.get_xlim()[::-1])  # reverse the x (truncation) axis
            ax.set_xticks(range(-1,Plot_range))
            ax.set_yticks(range(-1,Plot_range))
            ax.grid(True)
        # calculate the propotion of each position in matrix
        # (normalise each cell by the library total, scaled by AF)
        for i in range (Cal_range):
            for j in range (Cal_range):
                for lib in range (Num_libs):
                    if miRsum[lib] >0:
                        miR[lib][i][j] = float(miR[lib][i][j])/miRsum[lib]*AF
                    else:
                        miR[lib][i][j] = 0
        # plotting with miR name and library name
        ax = [ 0 for x in range(Num_libs)]
        for lib in range (Num_libs):
            random_color="#"+("%06x"%random.randint(0,16777215)) # plot each library with a different random colour
            ax[lib] = fig.add_subplot(Num_Row,Num_Column,lib+1)
            ax[lib].set_title(r'%s (%d reads)' %(lib_name[lib], miRsum[lib]),color='k',fontsize=14, ha='center')
            for i in range (Cal_range):
                for j in range (Cal_range):
                    ax[lib].scatter(i, j, c=random_color, s=miR[lib][i][j],linewidth=1,zorder=7)
            set_properties(ax[lib])
        plt.savefig('%s-%s.png' % (miRname,miRsize),bbox_inches='tight') #bbox_inches='tight' reduces left/right margins
        plt.savefig('%s-%s.pdf' % (miRname,miRsize),bbox_inches='tight')
        plt.close()
    #save multiple pdf into single pdf.
    def mergePDF():
        files_dir = os.getcwd() #get the path of current directory
        pdf_files = [f for f in os.listdir(files_dir) if f.endswith("pdf")]
        merger = PdfFileMerger()
        for filename in pdf_files:
            merger.append(PdfFileReader(os.path.join(files_dir, filename), "rb"))
        merger.write(os.path.join(files_dir, "merged_full.pdf"))
    ###################### Main Script ################################
    # Open file
    f = open (input_file, 'r')
    for line in f:
        if line.startswith( '>' ):
            # New miRNA section: draw the *previous* section first (n>1 means
            # one has already been parsed; the >EOF sentinel flushes the last).
            if n >1:
                print (miRsum)
                drawfigure (miRname, miRsize, lib_name, miR, miRsum)
            head = [(x) for x in line.split('_')]
            miRname = head[0][1:]
            miRseq = head[1].rstrip('\n')
            miRsize = len(miRseq)
            print (miRname, miRseq, miRsize,"\n")
            n = n + 1
            continue
        # Find number of libraries. The related data is in the second row for each miRNA section, start with "Complete"
        elif line.startswith( 'Complete' ):
            data = [(x) for x in line.split('\t')]
            #number of libraries (5 fixed columns precede the library columns)
            Num_libs = len(data) -5
            lib_name = data[5:]
            # Define a 10x10 two-dimention array for each miRNA
            miR = [[[0 for x in range(Cal_range)] for x in range(Cal_range)] for x in range(Num_libs)]
            miRsum = [ 0 for x in range(Num_libs)]
            for lib in range (Num_libs):
                miR[lib] = [[0 for col in range(Cal_range)] for row in range(Cal_range)]
                miRsum[lib] = 0
            continue
        # Recording truncation and tailing data
        else:
            data = [(x) for x in line.split('\t')]
            Seq = data[0]       # complete read sequence
            Com = data[1]       # aligned prefix (sub_tag)
            miRNA = data[2]     # canonical miRNA sequence
            Tail_seq = data[3]  # tail sequence, or "None"
            # decide if only include U tail
            if OnlyU == 1:
                if Tail_seq.find( 'A' ) > -1:
                    continue
                if Tail_seq.find( 'G' ) > -1:
                    continue
                if Tail_seq.find( 'C' ) > -1:
                    continue
            # calculate truncation and tailing length
            if Tail_seq.startswith( 'None' ):
                if len(Seq) >= len(miRNA):
                    Tail = len(Seq) - len(miRNA)
                    Truncation = 0
                elif len(Seq) < len(miRNA):
                    Tail = 0;
                    Truncation = len(miRNA) - len(Seq)
            else:
                if len(Com) <= len(miRNA):
                    Tail = len(Tail_seq)
                    Truncation = len(miRNA) - len (Com)
                elif len(Com) > len(miRNA):
                    Tail = len(Tail_seq) + (len(Com) - len(miRNA))
                    Truncation = 0
            # remove data points that are out of calculation range.
            if (Tail >(Cal_range-1) or Truncation > (Cal_range-1)):
                continue
            # decide if to include cononical miRNA
            if exclude_miR == 1:
                if (Tail ==0 and Truncation ==0):
                    continue
            # calculate abundnace for each position on truncation and tailing matrix
            abun = {}
            for lib in range (Num_libs):
                abun[lib] = int(data[5+lib])
                miRsum[lib] += abun[lib]
                miR[lib][Truncation][Tail] += abun[lib]
    f.close()
    mergePDF()
  570. def PP(module,alist):
  571. print('***********Parallel instance of %s is being executed*********' % (module))
  572. start = time.time()
  573. ##PP is being used for Bowtie mappings - This will avoid overflooding of processes to server
  574. nprocPP = round((nproc/int(nthread))+1) ## 1 added so as to avoid 0 processor being allocated in serial mode
  575. print('\nnprocPP:%s\n' % (nprocPP))
  576. npool = Pool(int(nprocPP))
  577. npool.map(module, alist)
  578. def main(inputFiles,Bowtie_index,miRNA_List):
  579. #inputfiles = ["MP_hen1-1-rep1.txt","MP_hen1-8-rep1.txt"] #str(sys.argv[1:3])#input files
  580. # outputfile = ["hen1_1_rep1_TEST_10_13_2014.txt","hen1_8_rep1_TEST_10_13_2014.txt"]#str(sys.argv[3:5]) #output_file_name
  581. # global bowtieindex
  582. # bowtieindex= "/alldata/Genomic/Arabidopsis/TAIR9/BowtieGenomicIndexes/AT_TAIR9_genome" #str(sys.argv[3])
  583. # for i in inputfiles:
  584. # dirname = pLCS_finder(i)
  585. inputfiles=inputFiles
  586. global bowtieindex
  587. bowtieindex=Bowtie_index
  588. PP(pLCS_finder,inputfiles) ## Added by Atul
  589. #for i in inputfiles:
  590. #pLCS_finder(i)
  591. print ("STEP1 DONE!!")
  592. outfile="step2_output.txt"
  593. DIRNAME=DIR_NAME#"5pLCS_results_2015-08-05_15:21:19.830100"#DIR_NAME #directory name
  594. Second_step_output=merge_5plCS(DIRNAME,outfile)
  595. # miRNA_file="new_miRNA_Test.fa"#str(sys.argv[4])
  596. miRNA_file=miRNA_List
  597. Third_step_output= tailling_Analysis(miRNA_file,Second_step_output)
  598. Fourth_step_output=format_Output(Third_step_output)
  599. genrate_Plots(Fourth_step_output)
  600. print("Tailing Pipeline is SUCCESSFULLY DONE!")
  601. if __name__ == '__main__': #calls the main function
  602. if nproc == 'Y':
  603. nproc = int(multiprocessing.cpu_count()*0.80)
  604. else:
  605. nproc == int(nproc)
  606. #create empty variables
  607. inputFiles=[]
  608. Bowtie_index=""
  609. miRNA_List=""
  610. #adding positional parser for command line argument
  611. parser = argparse.ArgumentParser(description='Getting input for Tailing Analysis')
  612. # parser.add_argument('tag_countfiles',nargs='+',help='load tag_count files')
  613. parser.add_argument('tag_count_PATH',nargs=1, help='Provide path to the tag_count files')
  614. parser.add_argument('bowtie_PATH',nargs=1,help='Provide path to the bowtie index')
  615. parser.add_argument('mirna_FILE',nargs=1,help='Load miRNA file')
  616. parser.add_argument('output_PATH',nargs=1, help='Provide path to the output of Tailing Pipeline')
  617. args=parser.parse_args()
  618. # inputFiles=args.tag_countfiles
  619. INPUT_DIR=str(args.tag_count_PATH[0])
  620. for fn in os.listdir(INPUT_DIR):
  621. if fn.endswith(".txt"):
  622. inputFiles.append(fn)
  623. Bowtie_index=str(args.bowtie_PATH[0])
  624. miRNA_List=str(args.mirna_FILE[0])
  625. # print (inputFiles)
  626. # print (Bowtie_index+"\n")
  627. # print (miRNA_List+"\n")
  628. OUTPUT_DIR=str(args.output_PATH[0])
  629. start_time = time.clock() # note start time of the exceution added by Parth after review of the paper- 8/6/2015
  630. main(inputFiles,Bowtie_index,miRNA_List)
  631. print (time.clock() - start_time, "seconds") # note start time of the exceution added by Parth after review of the paper- 8/6/2015
  632. sys.exit()