This article collects typical usage examples of the Python function utils.unParseTable. If you are trying to work out what unParseTable does, how to call it, or what real uses of it look like, the curated examples below should help.
Shown below are 20 code examples of the unParseTable function, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps our system recommend better Python code examples.
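All of the examples come from the Bradner/Lin lab ChIP-seq pipeline utilities, where unParseTable is the write-side counterpart of utils.parseTable: it takes a table represented as a list of rows (each row a list of cells) and writes it to disk with a chosen separator. If you only need the behavior and not the pipeline dependency, the minimal sketch below reproduces what the call sites rely on. The signature and the empty-separator branch are inferred from usage in the examples, so treat this as an assumption rather than the library's actual source:

# Minimal sketch of unParseTable, inferred from the call sites below
# (an assumption, not the pipeline's actual source). Cells are str()-converted,
# joined by sep, and each row is newline-terminated.
def unParseTable(table, output, sep):
    with open(output, 'w') as outfile:
        if len(sep) == 0:
            # assumed degenerate case: write one item per line
            for item in table:
                outfile.write(str(item) + '\n')
        else:
            for row in table:
                outfile.write(sep.join([str(x) for x in row]) + '\n')

# Hypothetical round trip with the read-side counterpart:
rows = [['GUIDE_ID', 'GENE', 'LOG2_RATIO'], ['guide_001', 'MYC', 1.72]]
unParseTable(rows, 'demo_table.txt', '\t')
# utils.parseTable('demo_table.txt', '\t') reads the file back as a list of rows

This matches the pattern used throughout the examples: build a header row, append data rows, then write everything out tab-delimited in one call.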
Example 1: makeFoldTable

def makeFoldTable(annotFile,analysisName,testName,controlName,testMMR,controlMMR,testIdxFile,controlIdxFile,outputFolder,epsilon = 1):

    '''
    makes the fold table and writes to disk
    fold table is ranked by fold change
    first column is guideID, second column is gene name, third is fold change
    '''

    guideDict,geneDict = makeAnnotDict(annotFile)

    testIdx = utils.parseTable(testIdxFile,'\t')
    controlIdx = utils.parseTable(controlIdxFile,'\t')

    #for each guide, divide the count by the MMR, add epsilon, then take the log2 ratio
    outTable = [['GUIDE_ID','GENE','LOG2_RATIO',testName,controlName]]
    for i in range(len(testIdx)):

        guideID = testIdx[i][0]
        gene = guideDict[guideID]

        testCount = float(testIdx[i][2])/testMMR + epsilon
        controlCount = float(controlIdx[i][2])/controlMMR + epsilon

        log2Ratio = numpy.log2(testCount/controlCount)

        newLine = [guideID,gene,log2Ratio,round(testCount,4),round(controlCount,4)]
        outTable.append(newLine)

    outputFile = '%s%s_log2Ratio.txt' % (outputFolder,analysisName)
    utils.unParseTable(outTable,outputFile,'\t')
    return outputFile

Author: BoulderLabs, Project: pipeline, Lines: 33, Source: processGeckoBam.py
Example 2: makeEnhancerSignalTable

def makeEnhancerSignalTable(mergedRegionMap,medianDict,analysisName,genome,outputFolder):

    '''
    makes a table where each row is an enhancer and each column is the log2
    background corrected signal vs. median
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap,'\t')

    namesList = medianDict.keys()

    signalTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE'] + namesList]

    for line in regionMap[1:]:

        newLine = line[0:6]
        for i in range(len(namesList)):
            enhancerIndex = (i*2) + 6
            controlIndex = (i*2) + 7
            enhancerSignal = float(line[enhancerIndex]) - float(line[controlIndex])
            if enhancerSignal < 0:
                enhancerSignal = 0
            enhancerSignal = enhancerSignal/medianDict[namesList[i]]
            newLine.append(enhancerSignal)

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder,genome,analysisName)
    print "WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile)
    utils.unParseTable(signalTable,outputFile,'\t')
    return outputFile

Author: zhouhufeng, Project: pipeline, Lines: 29, Source: clusterEnhancer.py
Example 3: filterSubpeaks

def filterSubpeaks(subpeakFile,gene_to_enhancer_dict, analysis_name,output_folder):

    '''
    takes the initial subpeaks in, stitches them,
    and writes the stitched regions out as a single BED file
    '''

    # stitch the subpeaks
    print(subpeakFile)
    subpeakCollection = utils.importBoundRegion(subpeakFile,'%s_subpeak' % (analysis_name))

    subpeakCollection = subpeakCollection.stitchCollection()

    subpeakLoci = subpeakCollection.getLoci()

    all_sub_bed = []
    for locus in subpeakLoci:
        bed_line = [locus.chr(),locus.start(),locus.end(),'.',locus.ID()]
        all_sub_bed.append(bed_line)

    all_bed_path = output_folder + analysis_name + '_all_subpeak.bed'
    utils.unParseTable(all_sub_bed, all_bed_path, '\t')

    return all_bed_path

Author: linlabcode, Project: pipeline, Lines: 25, Source: CRC3.py
Example 4: makeEnhancerSignalTable

def makeEnhancerSignalTable(nameDict,mergedRegionMap,medianDict,analysisName,genome,outputFolder):

    '''
    makes a table where each row is an enhancer and each column is the log2
    background corrected signal vs. median
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap,'\t')

    namesList = nameDict.keys()
    namesList.sort()

    signalTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE'] + namesList]

    print("len of %s for namesList" % (len(namesList)))
    print(namesList)

    for line in regionMap[1:]:

        newLine = line[0:6]

        #a little tricky here to add datasets sequentially
        i = 6 #start w/ the first column w/ data
        for name in namesList:
            if nameDict[name]['background'] == True:
                enhancerIndex = int(i)
                i += 1
                controlIndex = int(i)
                i += 1
                try:
                    enhancerSignal = float(line[enhancerIndex]) - float(line[controlIndex])
                except IndexError:
                    print line
                    print len(line)
                    print enhancerIndex
                    print controlIndex
                    sys.exit()
            else:
                enhancerIndex = int(i)
                i += 1
                enhancerSignal = float(line[enhancerIndex])

            if enhancerSignal < 0:
                enhancerSignal = 0
            enhancerSignal = enhancerSignal/medianDict[name]
            newLine.append(enhancerSignal)

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder,genome,analysisName)
    print "WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile)
    utils.unParseTable(signalTable,outputFile,'\t')
    return outputFile

Author: BoulderLabs, Project: pipeline, Lines: 57, Source: clusterEnhancer.py
Example 5: assignEnhancerRank

def assignEnhancerRank(enhancerToGeneFile,enhancerFile1,enhancerFile2,name1,name2,rankOutput=''):

    '''
    for all genes in the enhancerToGene table, assigns the highest overlapping ranked enhancer in the other tables
    '''

    enhancerToGene = utils.parseTable(enhancerToGeneFile,'\t')

    enhancerCollection1 = makeSECollection(enhancerFile1,name1,False)
    enhancerCollection2 = makeSECollection(enhancerFile2,name2,False)

    enhancerDict1 = makeSEDict(enhancerFile1,name1,False)
    enhancerDict2 = makeSEDict(enhancerFile2,name2,False)

    #we're going to update the enhancerToGene table
    enhancerToGene[0] += ['%s_rank' % name1,'%s_rank' % name2]

    for i in range(1,len(enhancerToGene)):

        line = enhancerToGene[i]

        locusLine = utils.Locus(line[1],line[2],line[3],'.',line[0])

        #if the enhancer doesn't exist, its ranking is dead last on the enhancer list
        enhancer1Overlap = enhancerCollection1.getOverlap(locusLine,'both')
        if len(enhancer1Overlap) == 0:
            enhancer1Rank = len(enhancerCollection1)
        else:
            rankList1 = [enhancerDict1[x.ID()]['rank'] for x in enhancer1Overlap]
            enhancer1Rank = min(rankList1)

        enhancer2Overlap = enhancerCollection2.getOverlap(locusLine,'both')
        if len(enhancer2Overlap) == 0:
            enhancer2Rank = len(enhancerCollection2)
        else:
            rankList2 = [enhancerDict2[x.ID()]['rank'] for x in enhancer2Overlap]
            enhancer2Rank = min(rankList2)

        enhancerToGene[i] += [enhancer1Rank,enhancer2Rank]

    if len(rankOutput) == 0:
        return enhancerToGene
    else:
        utils.unParseTable(enhancerToGene,rankOutput,'\t')

Author: zhouhufeng, Project: pipeline, Lines: 50, Source: dynamicEnhancer.py
Example 6: mergeCollections

def mergeCollections(nameDict,analysisName,output='',superOnly=True):

    '''
    merges the collections
    '''

    allLoci = []
    namesList = nameDict.keys()
    for name in namesList:

        seCollection = makeSECollection(nameDict[name]['enhancerFile'],name,superOnly)
        if superOnly:
            print "DATASET: %s HAS %s SUPERENHANCERS" % (name,len(seCollection))
        else:
            print "DATASET: %s HAS %s ENHANCERS" % (name,len(seCollection))
        allLoci += seCollection.getLoci()

    print len(allLoci)

    mergedCollection = utils.LocusCollection(allLoci,50)

    #stitch the collection together
    stitchedCollection = mergedCollection.stitchCollection()
    stitchedLoci = stitchedCollection.getLoci()

    print "IDENTIFIED %s CONSENSUS ENHANCER REGIONS" % (len(stitchedLoci))

    #sort by size and provide a unique ID
    sizeList = [locus.len() for locus in stitchedLoci]

    sizeOrder = utils.order(sizeList,decreasing=True)

    orderedLoci = [stitchedLoci[i] for i in sizeOrder]

    for i in range(len(orderedLoci)):
        orderedLoci[i]._ID = 'merged_%s_%s' % (analysisName,str(i+1))

    mergedGFF = []
    for locus in orderedLoci:
        newLine = [locus.chr(),locus.ID(),'',locus.start(),locus.end(),'',locus.sense(),'',locus.ID()]
        mergedGFF.append(newLine)

    if len(output) == 0:
        return mergedGFF
    else:
        print "writing merged gff to %s" % (output)
        utils.unParseTable(mergedGFF,output,'\t')
        return output

Author: BoulderLabs, Project: pipeline, Lines: 50, Source: clusterEnhancer.py
Example 7: getTonyInfo

def getTonyInfo(uniqueIDList,colList):

    '''
    pass this a uniqueID list and a list of columns
    '''

    uniqueIDString = string.join(uniqueIDList,',')

    columnString = string.join([str(x) for x in colList],',')
    cmd = "perl /ark/tony/admin/getDB_Data.pl -i %s -c %s -o TAB" % (uniqueIDString,columnString)
    sqlOut = subprocess.Popen(cmd,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True)
    sqlText = sqlOut.communicate()
    sqlText = sqlText[0]
    sqlTable = sqlText.split('\n')
    sqlTable = [x for x in sqlTable if len(x) > 0]
    sqlTable = [x.split('\t') for x in sqlTable]

    header = [x.split(':')[-1] for x in sqlTable[0][1:]]
    header = [str.upper(x) for x in header]
    header = ['GENOME', 'SOURCE', 'CELL_TYPE', 'NAME', 'BAMFILE'] #hard-coded header overrides the parsed one above

    tonyDict = {}
    for line in sqlTable[1:]:
        uniqueID = line[0]
        tonyDict[uniqueID] = {}
        for i in range(len(header)):
            tonyDict[uniqueID][header[i]] = line[(i+1)]

    newTable = []
    newTable.append(header)
    for key in tonyDict.keys():
        newLine = []
        newLine.append(str.upper(tonyDict[key]['GENOME']))
        newLine.append(tonyDict[key]['SOURCE'])
        newLine.append(tonyDict[key]['CELL_TYPE'])
        newLine.append(tonyDict[key]['NAME'])
        newLine.append(tonyDict[key]['BAMFILE'])
        newTable.append(newLine)

    #print newTable
    utils.unParseTable(newTable, '/grail/projects/masterBamTable.txt', '\t')

Author: BoulderLabs, Project: pipeline, Lines: 48, Source: bamTableUpdate.py
Example 8: findValleys

def findValleys(gene_to_enhancer_dict, bamFileList, projectName, projectFolder, cutoff = 0.2):

    '''
    takes in the super dict
    returns a dictionary of refseqs with all valley loci that are associated
    returns 2 kinds of bed files...
    1 = all
    '''

    #first make the bamDict
    all_valley_bed = []
    valleyDict = {}

    #start w/ a bamFileList and make a list of bam type objects
    bam_list = [utils.Bam(bam_path) for bam_path in bamFileList]
    max_read_length = max([bam.getReadLengths()[0] for bam in bam_list])

    gene_list = gene_to_enhancer_dict.keys()
    gene_list.sort()

    ticker = 0
    print('number of regions processed:')
    for gene in gene_list:

        valleyDict[gene] = []

        for region in gene_to_enhancer_dict[gene]:

            if ticker % 100 == 0:
                print(ticker)
            ticker += 1

            scoreArray = scoreValley(region, bam_list,max_read_length,projectName, projectFolder)
            for index,score in enumerate(scoreArray):
                if score > cutoff:
                    valley = utils.Locus(region.chr(), region.start() + index*10,
                                         region.start() + (index+1)*10, '.')
                    valleyDict[gene].append(valley)

        stitchedValleys = stitchValleys(valleyDict[gene])
        for valley in stitchedValleys:
            all_valley_bed.append([valley.chr(), valley.start(), valley.end()])
        valleyDict[gene] = stitchedValleys

    all_bed_path = projectFolder + projectName + '_all_valleys.bed'
    utils.unParseTable(all_valley_bed, all_bed_path, '\t')

    return all_bed_path

Author: linlabcode, Project: pipeline, Lines: 48, Source: CRC3.py
Example 9: mapGFFLineToBed

def mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=''):

    '''
    for every line produces a file with all of the rectangles to draw
    '''
    if len(header) == 0:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4])
    else:
        gffString = header
    diagramTable = [[0, 0, 0, 0]]
    nameTable = [['', 0, 0]]
    gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1])

    scaleFactor = float(nBins) / gffLocus.len()
    # plotting buffer for diagrams
    # plotBuffer = int(gffLocus.len() / float(nBins) * 20) # UNUSED (?)

    overlapLoci = bedCollection.getOverlap(gffLocus, sense='both')
    print("IDENTIFIED %s OVERLAPPING BED LOCI FOR REGION %s" % (len(overlapLoci),gffLine))

    # since beds come from multiple sources, we want to figure out how to offset them
    offsetDict = {}  # this will store each ID name
    bedNamesList = utils.uniquify([locus.ID() for locus in overlapLoci])
    bedNamesList.sort()
    for i in range(len(bedNamesList)):
        offsetDict[bedNamesList[i]] = 2 * i  # offsets different categories of bed regions

    if gffLine[6] == '-':
        refPoint = int(gffLine[4])
    else:
        refPoint = int(gffLine[3])

    # fill out the name table
    for name in bedNamesList:
        offset = offsetDict[name]
        nameTable.append([name, 0, 0.0 - offset])

    for bedLocus in overlapLoci:

        offset = offsetDict[bedLocus.ID()]
        [start, stop] = [abs(x - refPoint) * scaleFactor for x in bedLocus.coords()]
        diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset])

    utils.unParseTable(diagramTable, outFolder + gffString + '_bedDiagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_bedNameTemp.txt', '\t')

Author: linlabcode, Project: pipeline, Lines: 47, Source: bamPlot_turbo.py
Example 10: buildGraph

def buildGraph(edgeDict,gene_to_enhancer_dict,output_folder, analysis_name,cutoff=1):

    '''
    from the collapsed edge dictionary, build a target graph
    require at least n motifs to constitute an edge where n is set by cutoff.
    default is 1
    '''
    node_list = edgeDict.keys()
    node_list.sort()

    #this is only edges between TFs
    graph = nx.DiGraph(name=analysis_name)
    graph.add_nodes_from(node_list)

    #this stores ALL edges identified by motifs
    edge_table = [['SOURCE','TARGET','CHROM','START','STOP','REGION_ID','TF_INTERACTION']]
    edge_output = '%s%s_EDGE_TABLE.txt' % (output_folder,analysis_name)

    for source in node_list:
        print(source)
        target_list = edgeDict[source].keys()
        target_list.sort()
        for target in target_list:

            #now we need to see which target regions this guy overlaps
            target_regions = gene_to_enhancer_dict[target]
            target_collection = utils.LocusCollection(target_regions,50)

            #get the edges hitting that target
            edgeLoci = edgeDict[source][target]
            if node_list.count(target) > 0:
                tf_interaction = 1
            else:
                tf_interaction = 0

            #only add to the graph if this is a TF/TF interaction
            if len(edgeLoci) >= cutoff and node_list.count(target) > 0:
                graph.add_edge(source,target)

            #now for each edge, add to the table
            for edgeLocus in edgeLoci:
                regionString = ','.join([locus.ID() for locus in target_collection.getOverlap(edgeLocus)])
                edgeLine = [source,target,edgeLocus.chr(),edgeLocus.start(),edgeLocus.end(),regionString,tf_interaction]
                edge_table.append(edgeLine)

    utils.unParseTable(edge_table,edge_output,'\t')
    return graph

Author: linlabcode, Project: pipeline, Lines: 46, Source: CRC3.py
Example 11: mergeCollections

def mergeCollections(superFile1,superFile2,name1,name2,output=''):

    '''
    merges the collections
    '''
    conSuperCollection = makeSECollection(superFile1,name1)
    tnfSuperCollection = makeSECollection(superFile2,name2)

    #now merge them
    mergedLoci = conSuperCollection.getLoci() + tnfSuperCollection.getLoci()
    mergedCollection = utils.LocusCollection(mergedLoci,50)

    #stitch the collection together
    stitchedCollection = mergedCollection.stitchCollection()
    stitchedLoci = stitchedCollection.getLoci()

    #loci that are in both get renamed with a new unique identifier
    renamedLoci = []
    ticker = 1
    for locus in stitchedLoci:
        if len(conSuperCollection.getOverlap(locus)) > 0 and len(tnfSuperCollection.getOverlap(locus)) > 0:
            newID = 'CONSERVED_%s' % (str(ticker))
            ticker += 1
            locus._ID = newID
        else:
            locus._ID = locus.ID()[2:]
        renamedLoci.append(locus)

    #now we turn this into a gff and write it out
    gff = utils.locusCollectionToGFF(utils.LocusCollection(renamedLoci,50))

    if len(output) == 0:
        return gff
    else:
        print "writing merged gff to %s" % (output)
        utils.unParseTable(gff,output,'\t')
        return output

Author: zhouhufeng, Project: pipeline, Lines: 45, Source: dynamicEnhancer.py
Example 12: makeRigerTable

def makeRigerTable(foldTableFile,output=''):

    '''
    converts a fold table into the RIGER input format
    '''
    #need a table of this format
    rigerTable = [['Construct','GeneSymbol','NormalizedScore','Construct Rank','HairpinWeight']]
    #set weight to 1 for now

    foldTable = utils.parseTable(foldTableFile,'\t')
    constructOrder = utils.order([float(line[2]) for line in foldTable[1:]],decreasing=True)

    #make geneCountDict
    print("making gene count dictionary")
    geneCountDict = defaultdict(int)
    for line in foldTable[1:]:
        geneCountDict[line[1]] += 1

    print("iterating through constructs")
    constructRank = 1
    for i in constructOrder:
        rowIndex = i+1 # accounts for the header
        geneName = foldTable[rowIndex][1]
        if geneCountDict[geneName] == 1:
            print("Gene %s only has one guide RNA. Excluding from FRIGER analysis" % (geneName))
            continue

        newLine = foldTable[rowIndex][0:3] + [constructRank,1]
        rigerTable.append(newLine)
        constructRank += 1

    if len(output) == 0:
        output = string.replace(foldTableFile,'_log2Ratio.txt','_friger.txt')

    utils.unParseTable(rigerTable,output,'\t')

    return output

Author: BoulderLabs, Project: pipeline, Lines: 39, Source: processGeckoBam.py
Example 13: collapseRegionMap

def collapseRegionMap(regionMapFile,name='',controlBams=False):

    '''
    takes a regionMap file and collapses signal into a single column
    also fixes any start/stop sorting issues
    needs to take into account whether or not controls were used
    '''
    regionMap = utils.parseTable(regionMapFile,'\t')

    for n,line in enumerate(regionMap):

        if n == 0:
            #new header
            if len(name) == 0:
                name = 'MERGED_SIGNAL'
            regionMap[n] = line[0:6] + [name]

        else:
            newLine = list(line[0:6])
            if controlBams:
                signalLine = [float(x) for x in line[6:]]
                rankbyIndexes = range(0,len(signalLine)/2,1)
                controlIndexes = range(len(signalLine)/2,len(signalLine),1)
                metaVector = []
                for i,j in zip(rankbyIndexes,controlIndexes):
                    #min signal is 0
                    metaVector.append(max(0,signalLine[i] - signalLine[j]))
                metaSignal = numpy.mean(metaVector)
            else:
                metaSignal = numpy.mean([float(x) for x in line[6:]])

            regionMap[n] = newLine + [metaSignal]

    outputFile = string.replace(regionMapFile,'REGION','META')
    utils.unParseTable(regionMap,outputFile,'\t')

    return(outputFile)

Author: linlabcode, Project: pipeline, Lines: 36, Source: ROSE2_META.py
Example 14: mapCollection

def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName):

    '''
    makes a table of factor density in a stitched locus and ranks table by number of loci stitched together
    '''
    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    locusTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']]

    lociLenList = []

    # strip out any that are in chrY
    for locus in list(loci):
        if locus.chr() == 'chrY':
            loci.remove(locus)

    for locus in loci:
        # numLociList.append(int(stitchLocus.ID().split('_')[1]))
        lociLenList.append(locus.len())
        # numOrder = order(numLociList,decreasing=True)

    lenOrder = utils.order(lociLenList, decreasing=True)
    ticker = 0
    for i in lenOrder:

        ticker += 1
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        refEnrichSize = 0
        refOverlappingLoci = referenceCollection.getOverlap(locus, 'both')
        for refLocus in refOverlappingLoci:
            refEnrichSize += refLocus.len()

        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1

        coords = [int(x) for x in locus.coords()]

        locusTable.append([locus.ID(), locus.chr(), min(coords), max(coords), stitchCount, refEnrichSize])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)
    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        print('GETTING MAPPING DATA FOR %s' % bamFile)
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        print('OPENING %s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName))

        mappedGFF = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName), '\t')

        signalDict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        mappedLoci = []
        for line in mappedGFF[1:]:

            chrom = line[1].split('(')[0]
            start = int(line[1].split(':')[-1].split('-')[0])
            end = int(line[1].split(':')[-1].split('-')[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', line[0]))
            try:
                signalDict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print('WARNING NO SIGNAL FOR LINE:')
                print(line)
                continue

        mappedCollection = utils.LocusCollection(mappedLoci, 500)
        locusTable[0].append(bamFileName)

        for i in range(1, len(locusTable)):
            signal = 0.0
            line = locusTable[i]
            lineLocus = utils.Locus(line[1], line[2], line[3], '.')
            overlappingRegions = mappedCollection.getOverlap(lineLocus, sense='both')
            for region in overlappingRegions:
                signal += signalDict[region.ID()]
            locusTable[i].append(signal)

    utils.unParseTable(locusTable, output, '\t')

Author: linlabcode, Project: pipeline, Lines: 87, Source: ROSE2_META.py
Example 15: optimizeStitching

def optimizeStitching(locusCollection, name, outFolder, stepSize=500):

    '''
    takes a locus collection and starts writing out stitching stats at step sized intervals
    '''
    maxStitch = 15000  # set a hard wired max stitching parameter

    stitchTable = [['STEP', 'NUM_REGIONS', 'TOTAL_CONSTIT', 'TOTAL_REGION', 'MEAN_CONSTIT', 'MEDIAN_CONSTIT', 'MEAN_REGION', 'MEDIAN_REGION', 'MEAN_STITCH_FRACTION', 'MEDIAN_STITCH_FRACTION']]

    # first consolidate the collection
    locusCollection = locusCollection.stitchCollection(stitchWindow=0)
    total_constit = sum([locus.len() for locus in locusCollection.getLoci()])
    step = 0
    while step <= maxStitch:

        print("Getting stitch stats for %s (bp)" % (step))
        stitchCollection = locusCollection.stitchCollection(stitchWindow=step)
        num_regions = len(stitchCollection)
        stitchLoci = stitchCollection.getLoci()
        regionLengths = [locus.len() for locus in stitchLoci]
        total_region = sum(regionLengths)
        constitLengths = []
        for locus in stitchLoci:
            constitLoci = locusCollection.getOverlap(locus)
            constitLengths.append(sum([locus.len() for locus in constitLoci]))

        meanConstit = round(numpy.mean(constitLengths), 2)
        medianConstit = round(numpy.median(constitLengths), 2)

        meanRegion = round(numpy.mean(regionLengths), 2)
        medianRegion = round(numpy.median(regionLengths), 2)

        stitchFractions = [float(constitLengths[i]) / float(regionLengths[i]) for i in range(len(regionLengths))]
        meanStitchFraction = round(numpy.mean(stitchFractions), 2)
        medianStitchFraction = round(numpy.median(stitchFractions), 2)

        newLine = [step, num_regions, total_constit, total_region, meanConstit, medianConstit, meanRegion, medianRegion, meanStitchFraction, medianStitchFraction]
        stitchTable.append(newLine)

        step += stepSize

    # write the stitch table to disk
    stitchParamFile = '%s%s_stitch_params.tmp' % (outFolder, name)
    utils.unParseTable(stitchTable, stitchParamFile, '\t')

    # call the rscript
    rCmd = 'Rscript ./ROSE2_stitchOpt.R %s %s %s' % (stitchParamFile, outFolder, name)
    print(rCmd)

    # get back the stitch parameter
    rOutput = subprocess.Popen(rCmd, stdout=subprocess.PIPE, shell=True)
    rOutputTest = rOutput.communicate()
    print(rOutputTest)

    stitchParam = rOutputTest[0].split('\n')[2]
    try:
        stitchParam = int(stitchParam)
    except ValueError:
        print("INVALID STITCHING PARAMETER. STITCHING OPTIMIZATION FAILED")
        sys.exit()

    # delete? the table
    # os.system('rm -f %s' % (stitchParamFile))

    return stitchParam

Author: linlabcode, Project: pipeline, Lines: 63, Source: ROSE2_META.py
示例16: int
import utils
from sys import argv
filename = argv[1]
outname = filename[:-3] + 'sorted.bed'
bedfile = utils.parseTable(filename, '\t')
out = []
for line in bedfile:
coords = [int(line[1]), int(line[2])]
start = min(coords)
end = max(coords)
newline = [line[0], start, end] + line[3:]
out.append(newline)
utils.unParseTable(out, outname, '\t')
开发者ID:afederation,项目名称:bradner_utils,代码行数:20,代码来源:sortCoordsBED.py
Example 17: collapseFimo

def collapseFimo(fimo_output,gene_to_enhancer_dict,candidate_tf_list,output_folder,analysis_name,motifConvertFile):

    '''
    collapses motifs from fimo
    for each source node (TF) and each target node (gene enhancer regions), collapse motif instances
    then spit out a ginormous set of beds and a single crazy collapsed bed
    '''

    #first build up the motif name conversion database
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = defaultdict(list)
    # The reverse of the other dict, from motif name to gene name
    # a motif can go to multiple genes
    for line in motifDatabase:
        motifDatabaseDict[line[0]].append(line[1])

    #make the folder to store motif beds
    utils.formatFolder('%smotif_beds/' % (output_folder),True)

    edgeDict = {}
    #first layer are source nodes
    for tf in candidate_tf_list:
        edgeDict[tf] = defaultdict(list) #next layer are target nodes which are derived from the fimo output

    fimoTable = utils.parseTable(fimo_output,'\t')
    print(fimo_output)

    #fimo sometimes puts the region in either the first or second column
    fimo_line = fimoTable[1]
    if fimo_line[1].count('|') > 0:
        region_index = 1
    else:
        region_index = 2
    print('USING COLUMN %s OF FIMO OUTPUT FOR REGION' % (region_index))

    for line in fimoTable[1:]:
        source_tfs = motifDatabaseDict[line[0]]  #motifId
        for source in source_tfs:
            if candidate_tf_list.count(source) == 0:
                continue
            region = line[region_index].split('|')

            target = region[0]
            if region_index == 2:
                target_locus = utils.Locus(region[1],int(region[2]) + int(line[3]), int(region[2]) + int(line[4]),'.')
            else:
                target_locus = utils.Locus(region[1],int(region[2]) + int(line[2]), int(region[2]) + int(line[3]),'.')

            #what's missing here is the enhancer id of the target locus
            try:
                edgeDict[source][target].append(target_locus)
            except KeyError:
                print('this motif is not in the network')
                print(line)
                sys.exit()

    #now we actually want to collapse this down in a meaningful way
    #overlapping motifs count as a single binding site. This way a TF with tons of motifs
    #that finds the same site over and over again doesn't get over counted
    all_bed = []
    all_bed_path = '%s%s_all_motifs.bed' % (output_folder,analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        target_nodes = edgeDict[tf].keys()
        bed_header = ['track name = "%s" description="%s motifs in %s"' % (tf,tf,analysis_name)]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '%smotif_beds/%s_motifs.bed' % (output_folder,tf)
        for target in target_nodes:
            edgeCollection = utils.LocusCollection(edgeDict[tf][target],50)
            edgeCollection = edgeCollection.stitchCollection()
            edgeLoci = edgeCollection.getLoci()
            edgeDict[tf][target] = edgeLoci

            for locus in edgeLoci:
                bed_line = [locus.chr(),locus.start(),locus.end(),target,'','+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unParseTable(target_bed,target_bed_path,'\t')

    #now the loci are all stitched up
    utils.unParseTable(all_bed,all_bed_path,'\t')

    return edgeDict

Author: linlabcode, Project: pipeline, Lines: 87, Source: CRC3.py
Example 18: main

def main():

    from optparse import OptionParser

    usage = "usage: %prog [options] -b [SORTED BAMFILE] -i [INPUTFILE] -o [OUTPUTFILE]"
    parser = OptionParser(usage = usage)

    #required flags
    parser.add_option("-b","--bam", dest="bam",nargs = 1, default=None,
                      help = "Enter .bam file to be processed.")
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter .gff or ENRICHED REGION file to be processed.")

    #output flag
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output filename.")

    #additional options
    parser.add_option("-s","--sense", dest="sense",nargs = 1, default='.',
                      help = "Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_option("-e","--extension", dest="extension",nargs = 1, default=200,
                      help = "Extends reads by n bp. Default value is 200bp")
    parser.add_option("-r","--rpm", dest="rpm",action = 'store_true', default=False,
                      help = "Normalizes density to reads per million (rpm)")
    parser.add_option("-c","--cluster", dest="cluster",nargs = 1, default=None,
                      help = "Outputs a fixed bin size clustergram. User must specify bin size.")
    parser.add_option("-m","--matrix", dest="matrix",nargs = 1, default=None,
                      help = "Outputs a variable bin sized matrix. User must specify number of bins.")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    if options.sense:
        if ['+','-','.','both'].count(options.sense) == 0:
            print('ERROR: sense flag must be followed by +,-,.,both')
            parser.print_help()
            exit()

    if options.cluster and options.matrix:
        print('ERROR: Cannot specify both matrix and clustergram flags.')
        parser.print_help()
        exit()

    if options.matrix:
        try:
            int(options.matrix)
        except:
            print('ERROR: User must specify an integer bin number for matrix (try 50)')
            parser.print_help()
            exit()

    if options.cluster:
        try:
            int(options.cluster)
        except:
            print('ERROR: User must specify an integer bin size for clustergram (try 25)')
            parser.print_help()
            exit()

    if options.input and options.bam:
        inputFile = options.input
        if inputFile.split('.')[-1] != 'gff':
            print('converting file to a .gff')
            gffFile = convertEnrichedRegionsToGFF(inputFile)
        else:
            gffFile = inputFile

        bamFile = options.bam

        if options.output == None:
            output = os.getcwd() + '/' + inputFile.split('/')[-1] + '.mapped'
        else:
            output = options.output

        if options.cluster:
            print('mapping to GFF and making clustergram with fixed bin width')
            newGFF = mapBamToGFF(bamFile,gffFile,options.sense,int(options.extension),options.rpm,int(options.cluster),None)
        elif options.matrix:
            print('mapping to GFF and making a matrix with fixed bin number')
            newGFF = mapBamToGFF(bamFile,gffFile,options.sense,int(options.extension),options.rpm,None,int(options.matrix))

        print('bamToGFF_turbo writing output to: %s' % (output))
        # Hackjob to make subdirectories for ROSE integration
        try:
            os.mkdir(os.path.dirname(output))
        except OSError:
            pass

        utils.unParseTable(newGFF,output,'\t')

    else:
        parser.print_help()

Author: linlabcode, Project: pipeline, Lines: 89, Source: bamToGFF_turbo.py
Example 19: mapGFFLineToAnnot

def mapGFFLineToAnnot(gffLine, outFolder, nBins, geneDict, txCollection, sense='both', header=''):

    '''
    for every line produces a file with all of the rectangles to draw
    '''
    if len(header) == 0:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4])
    else:
        gffString = header
    diagramTable = [[0, 0, 0, 0]]
    nameTable = [['', 0, 0]]
    gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1])
    scaleFactor = float(nBins) / gffLocus.len()
    # plotting buffer for diagrams
    plotBuffer = int(gffLocus.len() / float(nBins) * 20)

    overlapLoci = txCollection.getOverlap(gffLocus, sense='both')
    geneList = [locus.ID() for locus in overlapLoci]

    if gffLine[6] == '-':
        refPoint = int(gffLine[4])
    else:
        refPoint = int(gffLine[3])
    offsetCollection = utils.LocusCollection([], 500)
    for geneID in geneList:

        gene = geneDict[geneID]

        print(gene.commonName())
        if len(gene.commonName()) > 1:
            name = gene.commonName()
        else:
            name = geneID
        offset = 4 * len(offsetCollection.getOverlap(gene.txLocus()))
        offsetCollection.append(utils.makeSearchLocus(gene.txLocus(), plotBuffer, plotBuffer))
        # write the name of the gene down
        if gene.sense() == '+':
            geneStart = gene.txLocus().start()
        else:
            geneStart = gene.txLocus().end()
        geneStart = abs(geneStart - refPoint) * scaleFactor
        nameTable.append([name, geneStart, -2 - offset])

        # draw a line across the entire txLocus
        [start, stop] = [abs(x - refPoint) * scaleFactor for x in gene.txLocus().coords()]
        diagramTable.append([start, -0.01 - offset, stop, 0.01 - offset])

        # now draw thin boxes for all txExons
        if len(gene.txExons()) > 0:
            for txExon in gene.txExons():
                [start, stop] = [abs(x - refPoint) * scaleFactor for x in txExon.coords()]
                diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset])

        # now draw fatty boxes for the coding exons if any
        if len(gene.cdExons()) > 0:
            for cdExon in gene.cdExons():
                [start, stop] = [abs(x - refPoint) * scaleFactor for x in cdExon.coords()]
                diagramTable.append([start, -1 - offset, stop, 1 - offset])

    utils.unParseTable(diagramTable, outFolder + gffString + '_diagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_nameTemp.txt', '\t')

Author: afederation, Project: pipeline, Lines: 66, Source: bamPlot_turbo.py
Example 20: finishRankOutput

#......... part of the code omitted here .........
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)

        #for lost
        elif float(line[-8]) < (-1 * cutOff) and int(line[-4]) == 1:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)

        #for conserved
        else:
            geneStatus = 'UNCHANGED'
            conservedBed.append(bedLine)

        #now fill in the gene table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed