- #!/usr/local/bin/python3
- # coding=utf-8
- #Created by Parth Patel, DBI @ University of Delaware, Newark, Delaware 19717
- #Date created: 10/08/2014 Modified on 11/2/2015 Line number 350,297
- import os
- import sys
- if sys.version_info < (2, 6):
- print ('You are using a version of Python that this program does not support. Please update to the latest version!')
- sys.exit(1)
- import subprocess, multiprocessing
- from multiprocessing import Process, Queue, Pool
- import logging
- import re #pattern search https://docs.python.org/2/howto/regex.html
- #import Bio #importing biopython
- #from Bio import SeqIO #import Bio-python
- #plot specific packages
- import matplotlib
- matplotlib.use('Agg')
- import matplotlib.pyplot as plt
- #from matplotlib import rc
- #from pylab import *
- import datetime,time,timeit #import Date and Time
- import argparse
- from PyPDF2 import PdfFileReader, PdfFileMerger #working with PDF in python
- import random
- ###################################################################################################################
- # STEP-1 #
- #Given a tag, finds the longest subsequence in it which aligns to a reference genome (corresponding BOWTIE INDEX) #
- #Given a file containing short sequences, finds the longest aligned subsequence from the 5' end #
- ###################################################################################################################
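- # Illustrative sketch of the STEP-1 trimming idea (hypothetical tag, not taken from real data):
- #   pass  0: TGACAGAAGAGAGTGAGCACTTT  -> whole tag sent to bowtie, no perfect hit
- #   pass -1: TGACAGAAGAGAGTGAGCACTT   -> one 3' base trimmed, still no hit
- #   pass -2: TGACAGAAGAGAGTGAGCACT    -> still no hit
- #   pass -3: TGACAGAAGAGAGTGAGCAC     -> perfect hit, so tail_length[tag] = -3
- # The aligned 5' prefix (the "head") is tag[:-3] and the trimmed 3' "tail" is "TTT".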
- nproc = 'Y'
- nthread = 6
- global DIR_NAME
- Date=str(datetime.date.today())#get Today's date
- Time=str(datetime.datetime.now().time())#get Current time
- DIR_NAME="5pLCS_results"+"_"+Date+"_"+Time
- global OUTPUT_DIR
- #OUTPUT_DIR_NAME="Tailing_output_"+Date+"_"+Time
- def RunBowtie(InputFile, BOWTIE_INDEX, Aligned, Unaligned, AlignmentOutput, pas):
- command = "bowtie --concise " +\
- "-v 0 " +\
- "--threads %s " % nthread +\
- "%s " % BOWTIE_INDEX +\
- "-r %s " % InputFile +\
- "%s " % AlignmentOutput +\
- "--al %s " % Aligned +\
- "--un %s " % Unaligned
-
- if os.system(command) == 0:
- print ("Bowtie Completed Successfully - Pass %d " % pas)
-
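- # For orientation, RunBowtie above assembles a plain bowtie call; with hypothetical file names
- # (and a bowtie build that still accepts --concise) it would look roughly like:
- #   bowtie --concise -v 0 --threads 6 AT_TAIR9_genome -r tempInputMP_hen1-1-rep1.txt.txt \
- #       BowtieMP_hen1-1-rep1.txt.out --al aligned_tagsMP_hen1-1-rep1.txt.out --un unaligned_tagsMP_hen1-1-rep1.txt.out
- # i.e. raw (-r) input, perfect matches only (-v 0), with aligned and unaligned tags written to
- # separate files so the unaligned ones can be trimmed and re-mapped in the next pass.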
- def pLCS_finder(inputfile):
- print ("Working on "+inputfile+"\n")
- InputFile=inputfile
- Date=str(datetime.date.today())#get Today's date
- OutputFile=os.path.splitext(inputfile)[0]+"_"+Date+".txt" #get the input filename without the extension and append Today's date
- bowtie_index=bowtieindex
- # print (bowtie_index)
-
- # Input validation: make sure the input file and bowtie index exist before reading
- if not os.path.isfile(InputFile):
- print ("Cannot find specified input file")
- sys.exit(1)
- 
- if not os.path.isfile(bowtie_index+'.1.ebwt'):
- print ("Cannot find specified bowtie index")
- sys.exit(1)
- 
- #open the input file for reading
- IN= open(InputFile,'r')
-
-
- abundances={}
- tail_length={}
-
- fh_in= IN.readline() ## Remove header
- for line in IN:
- #print(line)
- fields = line.split("\t")
- seq, value = fields[0], int(fields[1])
- abundances[seq] = value
-
- LogFile = os.path.splitext(OutputFile)[0] + '.size_distribution'
- tempInput = 'tempInput' +InputFile+ '.txt'
- AlignmentOutput = 'Bowtie' +InputFile+ '.out'
- aligned = 'aligned_tags' +InputFile+ '.out'
- unaligned = 'unaligned_tags' +InputFile+ '.out'
- pas = 0
-
- logging.basicConfig(filename=LogFile, level=logging.NOTSET, format="%(message)s")
-
- next_run = open(tempInput, 'w')
- next_run.write("\n".join(tag for tag in abundances.keys())+'\n')
- next_run.close()
-
- print ("Abundance is %d long and tail length is %d long" %(len(abundances), len(tail_length)))
- #
- # ###############################################################################################
- while len(abundances) != len(tail_length):
-
- # print ("Abundances")
- RunBowtie(tempInput, bowtie_index, aligned,unaligned, AlignmentOutput, pas)
- # count = 0
- list_of_aligned = {}
- try:
- for tag in open(aligned):
- list_of_aligned[tag[:-1]] = 1 # Create an entry for each tag that is aligned. The value bears no significance
- except IOError:
- pass
-
- # print ("\n\n\n\n\n\n\n\n\n\nDone Listing")
- # exit(0)
-
- for tag in abundances.keys():
- if tag not in tail_length:
- check_tag = tag
- if pas < 0:
- check_tag = tag[:pas] # For the special case of pas = 0, the intended slicing to trim would not work. For every other pass, this statement trims.
- if check_tag in list_of_aligned:
- tail_length[tag] = pas
-
- # print ("\n\n\n\n\n\n\n\n\nfound %d tails until this pass" % len(tail_length))
- #
- pas = int(pas)-1
- next_run = open(tempInput, 'w')
- unal = open(unaligned)
- next_run.write("\n".join(tag[:-2] for tag in unal.readlines())+'\n') # drop the trailing newline and one 3'-terminal base from each unaligned tag for the next pass
- unal.close()
- next_run.close()
- # print("\n\n\n\n\n\n\n\n\nClosed")
- #############################################################################################
-
-
- command = "rm %s %s %s %s" %(tempInput, AlignmentOutput, aligned, unaligned)
- if os.system(command) == 0:
- print ("Deleting temporary Files.........")
-
- taglength_count = {}
- headlength_count = {}
- taglength_sum = {}
- headlength_sum = {}
- root_of_head_length_sum = {}
-
- # Date=str(datetime.date.today())#get Today's date
- # Time=str(datetime.datetime.now().time())#get Current time )#get Current time
- # dirName="5pLCS_results"+"_"+Date+"_"+Time
- dirName=DIR_NAME
- if not os.path.exists(dirName): #create a directory for storing the results of STEP1
- os.makedirs(dirName)
- current_path= os.getcwd() #get the path of current directory
- new_path= os.path.join(current_path,dirName) #add newly created results' folder to an existing path
- os.chdir(new_path) #change the directory
-
- Output=open(OutputFile, 'w')
- Output.write("tag\tabundance\thead\thead_abundance\ttail\n")
- for tag in abundances.keys():
- Output.write("%s\t" % tag)
- Output.write("%d\t" % abundances[tag])
- if tail_length[tag] != 0:
- Output.write("%s\t" % tag[:tail_length[tag]] )
- head_abundance = abundances.get(tag[:tail_length[tag]], 0 )
- if head_abundance == 0:
- Output.write("0\t")
- else:
- Output.write("%d\t" % head_abundance)
- Output.write("%s\n" % tag[len(tag)+tail_length[tag]:] )
- else:
- Output.write("%s\t" % tag)
- Output.write("%d\t" % abundances[tag])
- Output.write("None\n")
-
-
-
- # The four try/except clauses below have to be maintained separately and as they are:
-
- try:
- taglength_count[ len(tag) ] = taglength_count[len(tag) ] + 1
- taglength_sum[ len(tag) ] = taglength_sum[ len(tag) ] + abundances[tag]
- except KeyError:
- taglength_count[ len(tag)] = 1
- taglength_sum[ len(tag) ] = abundances[tag]
-
- try:
- headlength_count[ len(tag) + tail_length[tag] ] = headlength_count[ len(tag) + tail_length[tag] ] + 1
- except KeyError:
- headlength_count[ len(tag) + tail_length[tag] ] = 1
-
- try:
- headlength_sum[ len(tag) + tail_length[tag] ] = headlength_sum[ len(tag) + tail_length[tag] ] + abundances.get(tag[:tail_length[tag]], 0 )
- except KeyError:
- headlength_sum[ len(tag) + tail_length[tag] ] = abundances.get(tag[:tail_length[tag]], 0 )
-
- try:
- root_of_head_length_sum[ len(tag) ] = root_of_head_length_sum[ len(tag) ] + abundances.get(tag[:tail_length[tag]], 0 )
- except KeyError:
- root_of_head_length_sum[ len(tag) ] = abundances.get(tag[:tail_length[tag]], 0 )
-
- Output.close()
-
- logging.info( "(Size-Group)\t(Number of tags)\t(Sum of tags)\t(Number of heads)\t(Sum of heads)\t(Sum of heads with tag in size)\n" )
- for key in range( min( headlength_count.keys() ), max( taglength_count.keys() )+1 ):
- logging.info( "%d\t%d\t%d\t%d\t%d\t%d" %( key, taglength_count.get(key,0), taglength_sum.get(key,0), headlength_count.get(key,0), headlength_sum.get(key,0), root_of_head_length_sum.get(key,0) ) )
-
- logging.shutdown()
- os.chdir(current_path)#go back to main directory
- print("\n\npLCS_finder Done")
- return dirName
- ####################################################################################
- # STEP-2 Merge the 5GMC/Tail outputs from the 1st step (one per library) into a single file #
- ####################################################################################
- # USAGE = " python2.6 Merge_5pLCS.py -[Name of the results Directory from first step] -[output_file_name] "
- def merge_5plCS(DIR,outfile):
- seqdir = DIR #"results"#str(sys.argv[1]) #seqdir directory_with_chrom_seq
- outputfile = outfile#"step2_output_HEN1.txt"#str(sys.argv[2]) #output_file_name
-
- #open output file for writing
- out= open(outputfile,'w')
-
- #library id starting from 1
- lib_id=1
-
-
- for filename in os.listdir(seqdir): #list all the files in the current directory
- # print os.listdir(seqdir)
- #for filename in os.listdir(os.getcwd()): #list all the files in the current directory
- lib_name= os.path.splitext(filename)[0] #get the filename without the extension
- # print lib_name + "\n" + filename
- in_file = open(os.path.join(seqdir,filename),'r') # open file for reading
-
-
- f = in_file.readlines()
- firstLine = f.pop(0) #removes the first line or the header row
- for line in f:
-
- out.write(str(lib_id))
- out.write("\t")
- out.write(lib_name)
- out.write("\t")
- out.write(line)
- out.write("\n")
-
- in_file.close() #closing the inputfile to fetch the next one
- lib_id=lib_id+1 #update the library id
- # format_Output(input_file)
-
-
- print ("Second Step is DONE..Files Merged.")
- out.close() #closing the output file
- return outputfile
- ####################################################################################
- # STEP-3 Tailing analysis: match the merged tags against the miRNA/siRNA list, per library #
- ####################################################################################
- # USAGE = " python2.6 Tailing_analysis_DB_v6.py -[Name of the miRNA/siRNA file] -[Merged file from the second step] "
- def tailing_Analysis(miRNA_file,merged_file):
- miRNA_file_name = miRNA_file #name of the miRNA/siRNA FASTA file
- merged_file_name = merged_file #merged file produced by the second step
-
- miRNA_NAME=[]
- miRNA_TAG=[]
- fh_in = open(miRNA_file_name,'r')
- miRNAData = fh_in.read().split('>')
- miRList = [] ## Store miRNA NAME and TAG as tuple
- for i in miRNAData[1:]:
- block = i.split('\n')
- ID = block[0].split(' ')[0]
- seq = block[1] # assumes single-line FASTA records (sequence on one line after the header)
- miRList.append((ID,seq))
- # print (ID,seq)
-
- miRListS = sorted(miRList) #Sorted by miRNA NAME
- for i in miRListS:
- ID,SEQ=i
- miRNA_NAME.append(ID)
- miRNA_TAG.append(SEQ)
-
- ##convert all U'S into T's for miRNA_TAG
- for index in range(0,len(miRNA_TAG)):
- # print miRNA_TAG[index]+"->>"+ re.sub("U","T",str(miRNA_TAG[index]))
- miRNA_TAG[index]=re.sub("U","T",str(miRNA_TAG[index]))
- miRNA_TAG[index]=miRNA_TAG[index].upper() # create an upper case if miRNAs are not already in uppercase, Added on 11/2/2015
- # print (miRNA_TAG[index],miRNA_NAME[index])
- print ("miRNA laoding DONE!")
-
-
-
- # output_dirName=OUTPUT_DIR_NAME
- # if not os.path.exists(output_dirName): #create a directory for storing the results of STEP1
- # os.makedirs(output_dirName)
- # current_path= os.getcwd() #get the path of current directory
-
- merged_input=open(merged_file_name, 'r') # READ merged file from step 2
-
- # new_path= os.path.join(current_path,output_dirName) #add newly created results' folder to an exisiting path
- # os.chdir(new_path) #change the directory
- os.chdir(OUTPUT_DIR)
- out= open("Tail_truncated_"+miRNA_file_name+merged_file_name,'w')
- out.write("Lib_ID \t Lib_NAME \t Tag \t Tag_abundance \t Sub_tag \t Sub_tag_hits \t Sub_tag_abun \t Tail \t Tail length\n") # tail length added by Parth after the 1st review of this paper. 8/5/2015
- out2=open("Tail_summary_truncated_"+miRNA_file_name+merged_file_name,'w')
- #out2.write("miRNA_NAME \t miRNA_TAG \t miR_abundance \t Lib_ID \t Lib_NAME \t sum_tailed \t percent \n") didn't use percent because it's giving value more than 100% e.g. 466.66
- out2.write("miRNA \t miRNA sequence \t Abundance \t library # \t library \t Sum of abundances \t tailing ratio = Sum of abundances/Abundance \n")
-
- hash_miR= {}
- miR_abun= {}
- hash_count= {}
- hash_lib_id= {}
- hash_miR_rows=[] #Created this list which acts as a Arrays of Hashes
-
-
- for lines in merged_input: #reading each line from the file
-
-
- if not lines.strip(): # To remove empty line in your file (usually the last line)
- continue
- #print (lines)
- lib_id, lib_name, tag ,tag_abun, sub_tag, sub_tag_abun,tail = lines.split('\t',6) # each merged line has 7 tab-separated fields
- # 1 test1 CCCAGGTCCAGACATAGTAAGGATTGACAGACTGAGATCACTTTCTTGATTC 1 CCCAGGTCCAGAC 0 ATAGTAAGGATTGACAGACTGAGATCACTTTCTTGATTC
-
- # print lib_id, lib_name, tag ,tag_abun, sub_tag, sub_tag_abun,tail
-
- hash_lib_id [lib_id]=lib_name
-
- if (len(tag)>30):
- continue
- if (len(sub_tag)<10):
- continue
-
-
- #search for truncated miRNA
- for index in range(0,len(miRNA_NAME)):
- ID=lib_id+"_"+miRNA_NAME[index]
-
-
- if((re.match(str(miRNA_TAG[index]),sub_tag,re.IGNORECASE)) or (re.match(sub_tag,str(miRNA_TAG[index]),re.IGNORECASE))): # the pattern at the start of the string, returning a match object, or None if no match was found. re.IGNORECASE Added on 11/2/2015 to handle lower case tag count file.
-
-
- if (ID in hash_miR): #Append matched pattern to an existing miRNA entry in the hash(dictionary)
-
- hash_miR[ID].append([lib_id, lib_name, tag ,tag_abun, sub_tag, sub_tag_abun,tail])
-
- else: #if miRNA entry doesn't exist, create an empty entry in the hash and append the pattern
-
- hash_miR[ID] = []
- hash_miR[ID].append([lib_id, lib_name, tag ,tag_abun, sub_tag, sub_tag_abun,tail])
-
-
- hash_count[sub_tag] = hash_count.get(sub_tag, 0) + 1 # count how many matched tags share this sub_tag
-
-
- if(miRNA_TAG[index]==tag):
- miR_abun[ID]=tag_abun
-
-
- print ("Tail matching Done!\n")
-
-
- for index in range(0,len(miRNA_NAME)):
-
- sortedKeys=sorted(hash_lib_id.keys())# sort the hash_lib_id on their keys
-
-
- for key in sortedKeys:
- lib_id=key
- ID=lib_id+"_"+miRNA_NAME[index]
- # print sorted(miR_abun.keys())
- if(ID in miR_abun):
- out.write (">"+miRNA_NAME[index]+"_"+miRNA_TAG[index]+"_"+lib_id+"_"+ hash_lib_id[lib_id]+"_"+ miR_abun[ID]+"\n")
- else:
- out.write (">"+miRNA_NAME[index]+"_"+miRNA_TAG[index]+"_"+lib_id+"_"+ hash_lib_id[lib_id]+"_"+ "_" +"\n")
- # continue
-
- sum_tailed=0
- tail_ratio=0
-
- if(ID in hash_miR): #Check if we have the miRNA entry in the hash or not
- VALUES = hash_miR[ID]
- # print VALUES
-
-
-
-
- for hash_miR_rows in VALUES:
- # print hash_miR_rows
- lib_id, lib_name, tag ,tag_abun, sub_tag,sub_tag_abun,tail=hash_miR_rows
-
- sub_tag_hits=hash_count[sub_tag] # compute sub_tag_hits
- tail=tail.strip()# remove "\n" char from string
- hash_miR_rows= lib_id, lib_name, tag ,tag_abun, sub_tag,sub_tag_hits,sub_tag_abun,tail,len(tail) # tail length added by Parth after the 1st review of this paper. 8/5/2015
-
- #print (hash_miR_rows)
-
- delimiter='\t';
- output = delimiter.join(str(x) for x in hash_miR_rows)
- out.write("%s \n" % output) # tail length added by Parth after the 1st review of this paper. 8/5/2015
-
- sum_tailed=sum_tailed+int(tag_abun)
-
- if(ID in miR_abun):
- if (int(miR_abun[ID])<1):
- # percent = "N\A"
- tail_ratio="N\A"
- else:
- tail_ratio= (float(sum_tailed)/int(miR_abun[ID]))#Calculate the ratio- CORRECT WAY
- # tail_ratio= 100*(float(sum_tailed)/int(miR_abun[ID]))#Calculate the ratio -WRONG WAY
- # percent=round(tail_ratio,2) #Round number to 2 digits after decimal point
- tail_ratio=round(tail_ratio,2) #Round number to 2 digits after decimal point
- # out2.write("%s \t %s \t %d \t %s \t %s \t %d \t %f \n" % (miRNA_NAME[index] ,miRNA_TAG[index],int(miR_abun[ID]),lib_id,hash_lib_id[lib_id],sum_tailed,percent))
- out2.write("%s \t %s \t %d \t %s \t %s \t %d \t %f \n" % (miRNA_NAME[index] ,miRNA_TAG[index],int(miR_abun[ID]),lib_id,hash_lib_id[lib_id],sum_tailed,tail_ratio))
- else:
- out2.write("%s \t %s \t %s \t %s \t %s \t %d \t %s \n" % (miRNA_NAME[index] ,miRNA_TAG[index],"0",lib_id,hash_lib_id[lib_id],sum_tailed,"N\A"))
-
- # closing out put file
- out.close()
- out2.close()
-
- print ("Step 3 is DONE!\n")
- return ("Tail_truncated_"+miRNA_file_name+merged_file_name)
- ####################################################################################
- # STEP- 4 Format output file from the 3rd step #
- ####################################################################################
- # USAGE = " python2.6 Format_Taling_v2.py -[Name of the outputfile from step3] "
- #>ath-miR156a_TGACAGAAGAGAGTGAGCAC_10_hen1_8_660
- #10 hen1_8 TGACAGAAGAGAGTGACCACA 1 TGACAGAAGAGAGTGA 216 0 CCACA
- #10 hen1_8 TGACAGAAGAGAGTGAGCACTTTT 16 TGACAGAAGAGAGTGAGCAC 726 660 TTTT
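- # For orientation, format_Output below regroups those rows into one block per miRNA, laid out as
- # (column layout only; the values are whatever the step-3 file contained):
- #   >ath-miR156a_TGACAGAAGAGAGTGAGCAC
- #   Complete_Sequence  Sub_tag  miRNA_seq  Tail  Tail length  <lib_1>  <lib_2> ...
- #   <sRNA sequence>    <head>   <miRNA>    <tail> <length>    <abun>   <abun>  ...
- # and ends with a ">EOF_NNNNNNNNNN" marker that lets STEP-5 flush the figure for the last miRNA.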
- def format_Output(input_file):
- inputfile = input_file #"Tail_truncated_new_miRNA_Test1.fastep2_output_HEN1.txt"#str(sys.argv[1]) #input_file_name
- outputfile="FORTMATTED_STEP4_OUTPUT.txt"
- out= open(outputfile,'w')
-
- #open output file for writing
- IN = open(inputfile,'r')
-
- #defining hash variable for miRNA
- hash_miRNA={}
- hash_lib={}
- hash_abun={}
-
- miRNA_name={}
-
- lib_id=[]
-
-
- LINES = IN.readlines()
- firstLine = LINES.pop(0) #removes the first line or the header row
- print (firstLine)
- for lines in LINES: #reading each line from the file
-
- if not lines.strip():# To remove empty line in your file (usually the last line)
- continue
-
- if(re.search('>',lines)):
-
- lines=re.sub('>',"",lines) # remove '>' from the line
- splitLine = re.split('_',lines,maxsplit=3) #split first 3 item with "_" delimiter
- name= splitLine[0] #storing miRNA name
- miRNA_seq=splitLine[1] #storing miRNA sequence
- lib_id = splitLine[2] #storing libary id
- # print name,miRNA_seq,lib_id
- hash_miRNA[name]=miRNA_seq
- continue
-
- # lines="10 hen1_8 TGACAGAAGAGAGTGAGCACTTTT 16 TGACAGAAGAGAGTGAGCAC 726 660 TTTT"
- # lines="1 hen1-1-rep1 TCGGACCAGGCTTCACTTTTTT 4 TCGGACCAGGCTTCA 1 46 CTTTTTT"
-
- lib_id, lib_name,sRNA_seq,sRNA_abun,sub_tag,sub_hit,sub_abun,tail,tail_len = lines.split('\t',8) # 9 tab-separated fields; tail length added by Parth after the 1st review of this paper. 8/5/2015
- # OR lib_id,lib_name,sRNA_seq,sRNA_abun,sub_tag,sub_hit,sub_abun,tail = re.split('\t',lines,maxsplit=8)
- # print lib_id, lib_name,sRNA_seq,sRNA_abun,sub_tag,sub_hit,sub_abun,tail
-
- # miRNA_name[name]=sRNA_seq,sub_tag,miRNA_seq,tail
- if (name in miRNA_name):
- miRNA_name[name].append([sRNA_seq,sub_tag,miRNA_seq,tail,tail_len])#Append matched pattern to an exisiting miRNA entry in the hash(dictionary) # tail length added by Parth after the 1st review of this paper. 8/5/2015
- else:
- miRNA_name[name]=[]
- miRNA_name[name].append([sRNA_seq,sub_tag,miRNA_seq,tail,tail_len])#if miRNA entry doesn't exist, create an empty entry in the hash and append the pattern # tail length added by Parth after the 1st review of this paper. 8/5/2015
- # print miRNA_name[name]
- # print lib_name,sRNA_seq,miRNA_name
- hash_lib[lib_name]=lib_name
- # print hash_lib [lib_name]
- # print hash_lib[lib_name]
- hash_abun[lib_name,sRNA_seq]=sRNA_abun
- # print hash_abun[lib_name,sRNA_seq]
-
-
-
- sortedKeys=sorted(hash_miRNA.keys()) # sort the hash_miRNA on their keys
- for key in sortedKeys:
-
- hash_sRNA={}
-
- # print ">"+key+"_"+hash_miRNA[key]+"\n"
- out.write(">"+key+"_"+hash_miRNA[key]+"\n")
- # print "Complete_Sequence \t Sub_tag \t miRNA_seq \t Tail"
- out.write("Complete_Sequence \t"+" Sub_tag \t"+" miRNA_seq \t"+"Tail \t"+"Tail length")
- sortedKeys_1=sorted(hash_lib.keys())# sort the hash_lib on their keys
- for lib in sortedKeys_1:
- out.write("\t"+lib)
- out.write("\n")
-
-
-
-
- if(key in miRNA_name):
-
- VALUES=miRNA_name[key]
- for miRNA_rows in VALUES:
- sRNA_seq,sub_tag,miRNA_seq,tail,tail_len=miRNA_rows # tail length added by Parth after the 1st review of this paper. 8/5/2015
-
- delimiter='\t';
- output = delimiter.join(str(x) for x in miRNA_rows)
- # print output
- count=0
- for lib_entry in sortedKeys_1:
- # count=+1
- # print "count ="+str(count)+"\n"
- if((lib_entry,sRNA_seq) in hash_abun):
- if(int(hash_abun[lib_entry,sRNA_seq])<1):
- hash_abun[lib_entry,sRNA_seq]="0" #keep it as a string so it can be concatenated below
- output=output.replace("\n","")
- output=output+"\t"+hash_abun[lib_entry,sRNA_seq] #Appending sRNA_seq abundance based on lib_ID
- print (output)
- else:
- output=output.replace("\n","")
- output=output+"\t"+"0" #Appending sRNA_seq aundance = 0 based on IF[lib_entry,sRNA_seq] DOES NOT EXIST in hash_sRNA
-
- hash_sRNA[sRNA_seq]=output
-
-
- for sRNA_keys in (hash_sRNA.keys()):
- out.write(hash_sRNA[sRNA_keys]+"\n")
- # output=""
-
- out.write(">EOF_NNNNNNNNNN\n")
- #closing files
- IN.close()
- out.close()
- print ("Step-4 is DONE! \n")
- return (outputfile)
- ####################################################################################
- # STEP-5 Generate plots from the 4th step #
- ####################################################################################
- def generate_Plots(inputfile):
- ######################### User input #############################
- #input file
- input_file = inputfile
- #only U tail? (1=yes, 0= no)
- OnlyU = 1
- #exclude canonical miRNA? (1=yes, 0= no)
- exclude_miR = 0
- # Define number of Columns and Rows for each figure
- Num_Column = 4
- Num_Row = 7
- Total_lib = 0
- # Calculation Range, default 10
- Cal_range = 10
- # plotting range for truncation and tailing (e.g. miR163 needs a larger range because it is 24 nt)
- Plot_range = 9
- #use n to initiate drawfigure
- n = 1
- # Amplification Factor
- AF = 1000
- # figure size
- Figure_size = 5
-
- # Draw figure
- def drawfigure(miRname, miRsize, lib_name, miR, miRsum):
-
- fig = plt.figure(figsize=(Figure_size * Num_Column, Figure_size * Num_Row))
- # fig.suptitle(r'%s, size %snt'%(miRname,miRsize),color='k',fontsize=18, ha='center')
- fig.suptitle(r'%s, size %s-nt'%(miRname,miRsize),x=0.30,y=0.925,color='k',fontsize=18, ha='right') #Added x=0.35 and y=0.925 by Parth to center the suptitle- Link http://stackoverflow.com/questions/8248467/matplotlib-tight-layout-doesnt-take-into-account-figure-suptitle
- fig.subplots_adjust(left=0.125, right=1, bottom=0, top=0.9, wspace=0.4, hspace=.5)# add white space between subplots
-
- def set_properties(ax):
- for i in range(Plot_range + 1):
- ax.plot((Plot_range,-1+i),(Plot_range-i,-1),'#AFC7C7',linewidth=1,zorder=2)#, linestyle= 'dashdot'
- ax.plot((Plot_range-i,-1),(Plot_range,-1+i),'#AFC7C7',linewidth=1,zorder=2)#, linestyle= 'dashdot'
- ax.set_yscale('linear')
- ax.set_xscale('linear')
- plt.xlabel('Length of Truncation', fontsize=16) #added by Parth after the 1st review of this paper. 8/5/2015
- plt.ylabel('Length of Tailing', fontsize=16) #added by Parth after the 1st review of this paper. 8/5/2015
- ax.set_xlim(-1, Plot_range)
- ax.set_ylim(-1, Plot_range)
- ax.set_xlim(ax.get_xlim()[::-1])
- ax.set_xticks(range(-1,Plot_range))
- ax.set_yticks(range(-1,Plot_range))
- ax.grid(True)
- # calculate the proportion of each position in the matrix
- for i in range (Cal_range):
- for j in range (Cal_range):
- for lib in range (Num_libs):
- if miRsum[lib] >0:
- miR[lib][i][j] = float(miR[lib][i][j])/miRsum[lib]*AF
- else:
- miR[lib][i][j] = 0
- # plotting with miR name and library name
- ax = [ 0 for x in range(Num_libs)]
- for lib in range (Num_libs):
- random_color="#"+("%06x"%random.randint(0,16777215)) # Added by Parth to plot each library with different random colors
- # random_color = "#%06x" % random.randint(0,0xFFFFFF)
- ax[lib] = fig.add_subplot(Num_Row,Num_Column,lib+1)
- ax[lib].set_title(r'%s (%d reads)' %(lib_name[lib], miRsum[lib]),color='k',fontsize=14, ha='center')
- for i in range (Cal_range):
- for j in range (Cal_range):
- # ax[lib].scatter(i, j, c='red', s=miR[lib][i][j],linewidth=1,zorder=7)
- ax[lib].scatter(i, j, c=random_color, s=miR[lib][i][j],linewidth=1,zorder=7)
- set_properties(ax[lib])
-
- # plt.show()
- plt.savefig('%s-%s.png' % (miRname,miRsize),bbox_inches='tight') #bbox_inches='tight' is used to reduce left and right margins in matplotlib plot
- plt.savefig('%s-%s.pdf' % (miRname,miRsize),bbox_inches='tight')
- plt.close()
-
-
-
-
-
- #save multiple pdf into single pdf.
- def mergePDF():
-
- files_dir = os.getcwd() #get the path of current directory
- pdf_files = [f for f in os.listdir(files_dir) if f.endswith("pdf")]
- merger = PdfFileMerger()
- for filename in pdf_files:
- merger.append(PdfFileReader(os.path.join(files_dir, filename), "rb"))
- merger.write(os.path.join(files_dir, "merged_full.pdf"))
-
-
- ###################### Main Script ################################
-
- # Open file
- f = open (input_file, 'r')
- for line in f:
- if line.startswith( '>' ):
- if n >1:
- print (miRsum)
- drawfigure (miRname, miRsize, lib_name, miR, miRsum)
- head = [(x) for x in line.split('_')]
- miRname = head[0][1:]
- miRseq = head[1].rstrip('\n')
- miRsize = len(miRseq)
- print (miRname, miRseq, miRsize,"\n")
- n = n + 1
- continue
- # Find the number of libraries. The related data is in the second row of each miRNA section, starting with "Complete"
- elif line.startswith( 'Complete' ):
- data = [(x) for x in line.split('\t')]
- #number of libraries
- Num_libs = len(data) -5
- lib_name = data[5:]
- # Define a 10x10 two-dimensional array per library for each miRNA
- miR = [[[0 for x in range(Cal_range)] for x in range(Cal_range)] for x in range(Num_libs)]
- miRsum = [ 0 for x in range(Num_libs)]
- for lib in range (Num_libs):
- miR[lib] = [[0 for col in range(Cal_range)] for row in range(Cal_range)]
- miRsum[lib] = 0
- continue
- # Recording truncation and tailing data
- else:
- data = [(x) for x in line.split('\t')]
- Seq = data[0]
- Com = data[1]
- miRNA = data[2]
- Tail_seq = data[3]
- # decide if only include U tail
- if OnlyU == 1:
- if Tail_seq.find( 'A' ) > -1:
- continue
- if Tail_seq.find( 'G' ) > -1:
- continue
- if Tail_seq.find( 'C' ) > -1:
- continue
- # calculate truncation and tailing length
- if Tail_seq.startswith( 'None' ):
- if len(Seq) >= len(miRNA):
- Tail = len(Seq) - len(miRNA)
- Truncation = 0
- elif len(Seq) < len(miRNA):
- Tail = 0;
- Truncation = len(miRNA) - len(Seq)
- else:
- if len(Com) <= len(miRNA):
- Tail = len(Tail_seq)
- Truncation = len(miRNA) - len (Com)
- elif len(Com) > len(miRNA):
- Tail = len(Tail_seq) + (len(Com) - len(miRNA))
- Truncation = 0
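- # Worked example of the branch above (hypothetical read): for a 21-nt miRNA, an 18-nt aligned
- # head (Com) and a 3-nt tail "TTT", Truncation = 21 - 18 = 3 and Tail = 3, so this read's
- # abundance is added at matrix position [3][3] for its library below.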
- # remove data points that are out of calculation range.
- if (Tail >(Cal_range-1) or Truncation > (Cal_range-1)):
- continue
- # decide whether to include the canonical miRNA
- if exclude_miR == 1:
- if (Tail ==0 and Truncation ==0):
- continue
- # calculate abundance for each position on the truncation and tailing matrix
- abun = {}
- for lib in range (Num_libs):
- abun[lib] = int(data[5+lib])
- miRsum[lib] += abun[lib]
- miR[lib][Truncation][Tail] += abun[lib]
- f.close()
- mergePDF()
- def PP(module,alist):
- print('***********Parallel instance of %s is being executed*********' % (module))
-
- start = time.time()
- ##PP is used for the Bowtie mappings - this avoids flooding the server with processes
- nprocPP = round((nproc/int(nthread))+1) ## 1 added so as to avoid 0 processor being allocated in serial mode
- print('\nnprocPP:%s\n' % (nprocPP))
- npool = Pool(int(nprocPP))
- npool.map(module, alist)
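- # Example of the worker count (hypothetical machine): with 24 CPUs, nproc = int(24*0.80) = 19 and
- # nthread = 6, so nprocPP = round(19/6 + 1) = 4, i.e. four pLCS_finder workers run in parallel,
- # each launching bowtie with 6 threads.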
- def main(inputFiles,Bowtie_index,miRNA_List):
-
- #inputfiles = ["MP_hen1-1-rep1.txt","MP_hen1-8-rep1.txt"] #str(sys.argv[1:3])#input files
- # outputfile = ["hen1_1_rep1_TEST_10_13_2014.txt","hen1_8_rep1_TEST_10_13_2014.txt"]#str(sys.argv[3:5]) #output_file_name
- # global bowtieindex
- # bowtieindex= "/alldata/Genomic/Arabidopsis/TAIR9/BowtieGenomicIndexes/AT_TAIR9_genome" #str(sys.argv[3])
-
- # for i in inputfiles:
- # dirname = pLCS_finder(i)
-
- inputfiles=inputFiles
- global bowtieindex
- bowtieindex=Bowtie_index
- PP(pLCS_finder,inputfiles) ## Added by Atul
- #for i in inputfiles:
- #pLCS_finder(i)
- print ("STEP1 DONE!!")
- outfile="step2_output.txt"
- DIRNAME=DIR_NAME #results directory created by STEP-1
- Second_step_output=merge_5plCS(DIRNAME,outfile)
- # miRNA_file="new_miRNA_Test.fa"#str(sys.argv[4])
- miRNA_file=miRNA_List
- Third_step_output= tailing_Analysis(miRNA_file,Second_step_output)
- Fourth_step_output=format_Output(Third_step_output)
- generate_Plots(Fourth_step_output)
- print("Tailing Pipeline is SUCCESSFULLY DONE!")
-
- if __name__ == '__main__': #calls the main function
- if nproc == 'Y':
- nproc = int(multiprocessing.cpu_count()*0.80)
- else:
- nproc = int(nproc)
-
- #create empty variables
- inputFiles=[]
- Bowtie_index=""
- miRNA_List=""
-
- #adding positional parser for command line argument
- parser = argparse.ArgumentParser(description='Getting input for Tailing Analysis')
- # parser.add_argument('tag_countfiles',nargs='+',help='load tag_count files')
- parser.add_argument('tag_count_PATH',nargs=1, help='Provide path to the tag_count files')
- parser.add_argument('bowtie_PATH',nargs=1,help='Provide path to the bowtie index')
- parser.add_argument('mirna_FILE',nargs=1,help='Load miRNA file')
- parser.add_argument('output_PATH',nargs=1, help='Provide path to the output of Tailing Pipeline')
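- # Example invocation (illustrative paths only; substitute the actual script name and locations):
- #   python3 Tailing_analysis.py ./tag_count_files /alldata/Genomic/Arabidopsis/TAIR9/BowtieGenomicIndexes/AT_TAIR9_genome miRNA_list.fa ./tailing_results
- # Positional arguments, in order: tag_count_PATH, bowtie_PATH (bowtie index prefix), mirna_FILE, output_PATH.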
- args=parser.parse_args()
- # inputFiles=args.tag_countfiles
-
- INPUT_DIR=str(args.tag_count_PATH[0])
- for fn in os.listdir(INPUT_DIR):
- if fn.endswith(".txt"):
- inputFiles.append(fn)
- Bowtie_index=str(args.bowtie_PATH[0])
- miRNA_List=str(args.mirna_FILE[0])
- # print (inputFiles)
- # print (Bowtie_index+"\n")
- # print (miRNA_List+"\n")
- OUTPUT_DIR=str(args.output_PATH[0])
- start_time = time.perf_counter() # note start time of the execution; added by Parth after review of the paper- 8/6/2015
- main(inputFiles,Bowtie_index,miRNA_List)
- print (time.perf_counter() - start_time, "seconds") # report elapsed execution time; added by Parth after review of the paper- 8/6/2015
- sys.exit()