This article collects typical usage examples of the Python function pybedtools.cleanup. If you are wondering what cleanup does, how to call it, or what real-world usage looks like, the curated code examples below may help.
A total of 20 cleanup code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
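Before the examples, here is a minimal sketch of the typical cleanup() workflow, distilled from the samples below (an illustrative snippet only, not taken from any of the projects; a.bed and b.bed are placeholder file names):

import pybedtools

# Temporary files produced by BedTool operations go to this directory.
pybedtools.set_tempdir('.')

# Each operation such as intersect() writes its result to a temp file.
a = pybedtools.BedTool('a.bed')
b = pybedtools.BedTool('b.bed')
c = a.intersect(b)

# Delete the temp files created during this session; verbose=True lists them.
pybedtools.cleanup(verbose=True)

# remove_all=True also deletes leftover pybedtools temp files from
# previous sessions found in the temp directory.
pybedtools.cleanup(remove_all=True)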
Example 1: test_cleanup
def test_cleanup():
    """
    make sure the tempdir and cleanup work
    """
    assert os.path.abspath(pybedtools.get_tempdir()) == os.path.abspath('.')

    # make a fake tempfile, not created during this pybedtools session
    testfn = 'pybedtools.TESTING.tmp'
    os.system('touch %s' % testfn)
    assert os.path.exists(testfn)

    # make some temp files
    a = pybedtools.BedTool(os.path.join(testdir, 'data', 'a.bed'))
    b = pybedtools.BedTool(os.path.join(testdir, 'data', 'b.bed'))
    c = a.intersect(b)

    # after standard cleanup, c's fn should be gone but the fake one still
    # there...
    pybedtools.cleanup(verbose=True)
    assert os.path.exists(testfn)
    assert not os.path.exists(c.fn)

    # Unless we force the removal of all temp files.
    pybedtools.cleanup(remove_all=True)
    assert not os.path.exists(testfn)

    # a.fn and b.fn better be there still!
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
Author: Fabrices | Project: pybedtools | Lines: 29 | Source: test_helpers.py
Example 2: main
def main():
    """
    Third quick example from the documentation -- count reads in introns and
    exons, in parallel
    """
    ap = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                 usage=__doc__)
    ap.add_argument('--gff', required=True,
                    help='GFF or GTF file containing annotations')
    ap.add_argument('--bam', required=True,
                    help='BAM file containing reads to be counted')
    ap.add_argument('--stranded', action='store_true',
                    help='Use strand-specific merging and overlap. '
                         'Default is to ignore strand')
    ap.add_argument('--no-parallel', dest='noparallel', action='store_true',
                    help='Disables parallel computation')
    ap.add_argument('-o', '--output',
                    help='Optional file to which results will be written; '
                         'default is stdout')
    ap.add_argument('-v', '--verbose', action='store_true',
                    help='Verbose (goes to stderr)')
    args = ap.parse_args()

    gff = args.gff
    bam = args.bam
    stranded = args.stranded
    parallel = not args.noparallel

    # Some GFF files have invalid entries -- like chromosomes with negative
    # coords or features of length = 0. This line removes them and saves the
    # result in a tempfile
    g = pybedtools.BedTool(gff).remove_invalid().saveas()

    # Decide which version of map to use. If parallel, we only need 3
    # processes.
    pool = multiprocessing.Pool(processes=3)

    # Get separate files for introns and exons in parallel (if specified)
    featuretypes = ('intron', 'exon')
    introns, exons = pool.map(subset_featuretypes, featuretypes)

    # Perform some genome algebra to get unique and shared regions
    exon_only = exons.subtract(introns).merge().remove_invalid().saveas()
    intron_only = introns.subtract(exons).merge().remove_invalid().saveas()
    intron_and_exon = exons\
        .intersect(introns).merge().remove_invalid().saveas()

    # Do intersections with BAM file in parallel
    features = (exon_only, intron_only, intron_and_exon)
    results = pool.map(count_reads_in_features, features)

    labels = (' exon only:',
              ' intron only:',
              'intron and exon:')

    for label, reads in zip(labels, results):
        print('%s %s' % (label, reads))

    pybedtools.cleanup(verbose=False)
Author: PanosFirmpas | Project: pybedtools | Lines: 59 | Source: intron_exon_reads.py
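Example 2 calls two helpers, subset_featuretypes and count_reads_in_features, that are defined elsewhere in intron_exon_reads.py and are not shown above. The following is only a rough sketch of what they might look like, modelled on the pybedtools documentation example rather than copied from it; it assumes g and bam are module-level variables set in main():

def featuretype_filter(feature, featuretype):
    # Pass through only GFF features of the requested type (column 3).
    return feature[2] == featuretype

def subset_featuretypes(featuretype):
    # Subset the cleaned GFF (module-level g) to one feature type and save
    # the result to a tempfile so it can be shared between processes.
    return g.filter(featuretype_filter, featuretype).saveas()

def count_reads_in_features(features):
    # Count reads from the BAM file (module-level bam) overlapping the features.
    return pybedtools.BedTool(bam).intersect(b=features).count()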
Example 3: run_spades_parallel
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=SPADES_MAX_INTERVAL_SIZE,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        svs_to_assemble=SVS_ASSEMBLY_SUPPORTED,
                        stop_on_fail=False, max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    pybedtools.set_tempdir(work)

    logger.info("Running SPAdes on the intervals in %s" % bed)
    if not bed:
        logger.info("No BED file specified")
        return None, None

    bedtool = pybedtools.BedTool(bed)
    total = bedtool.count()

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool] if not chrs else [interval for interval in bedtool if
                                                                         interval.chrom in chrs]
    selected_intervals = filter(partial(should_be_assembled, max_interval_size=max_interval_size,
                                        svs_to_assemble=svs_to_assemble), all_intervals)
    ignored_intervals = filter(partial(shouldnt_be_assembled, max_interval_size=max_interval_size,
                                       svs_to_assemble=svs_to_assemble), all_intervals)

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in xrange(nthreads):
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max,
                       "stop_on_fail": stop_on_fail, "max_read_pairs": max_read_pairs}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))

    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    if os.path.getsize(assembled_fasta) > 0:
        logger.info("Indexing the assemblies")
        pysam.faidx(assembled_fasta)
    else:
        logger.error("No assembly generated")
        assembled_fasta = None

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
Author: thongnt2 | Project: metasv | Lines: 57 | Source: run_spades.py
Example 4: main
def main():
    args = argparser()
    forwardsites = findSites(args.input, args.reference, "+")
    revsites = findSites(args.input, args.reference, "-")
    try:
        with open(args.output, 'wb') as of:
            for l in forwardsites + revsites:
                of.write(l + "\n")
    except TypeError:
        for l in forwardsites + revsites:
            args.output.write(l + "\n")
    pybedtools.cleanup(remove_all=True)
Author: adamjorr | Project: zypy | Lines: 12 | Source: degenerate.py
Example 5: run_spades_parallel
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=50000,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        disable_deletion_assembly=False, stop_on_fail=False):
    pybedtools.set_tempdir(work)

    bedtool = pybedtools.BedTool(bed)
    total = bedtool.count()

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool] if not chrs else [interval for interval in bedtool if
                                                                         interval.chrom in chrs]
    selected_intervals = filter(partial(should_be_assembled, disable_deletion_assembly=disable_deletion_assembly),
                                all_intervals)
    ignored_intervals = filter(partial(shouldnt_be_assembled, disable_deletion_assembly=disable_deletion_assembly),
                               all_intervals)

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in xrange(nthreads):
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max,
                       "stop_on_fail": stop_on_fail}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))

    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    logger.info("Indexing the assemblies")
    pysam.faidx(assembled_fasta)

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
Author: BioinformaticsArchive | Project: metasv | Lines: 42 | Source: run_spades.py
Example 6: GetRatioGenome
def GetRatioGenome():
    pulldown = pysam.Samfile(args.pulldown)
    control = pysam.Samfile(args.control)
    if args.tot_pulldown is not None:
        tot_pulldown = args.tot_pulldown
        tot_control = args.tot_control
    else:
        tot_pulldown = pulldown.mapped + pulldown.unmapped
        tot_control = control.mapped + control.unmapped
    print >>sys.stderr, "Total number of reads in pulldown sample: %d" % (tot_pulldown)
    print >>sys.stderr, "Total number of reads in control sample: %d" % (tot_control)
    # get spike-in reads within the bed file
    pulldown_bam = pybedtools.BedTool(args.pulldown)
    control_bam = pybedtools.BedTool(args.control)
    spike_pulldown = pulldown_bam.intersect(args.pos, f=0.5).count()
    spike_control = control_bam.intersect(args.pos, f=0.5).count()
    print >>sys.stderr, "Total number of reads mapped in spike-in in pulldown sample: %d" % (spike_pulldown)
    print >>sys.stderr, "Total number of reads mapped in spike-in in control sample: %d" % (spike_control)
    ratio = float(spike_control) / float(spike_pulldown) * float(tot_pulldown) / float(tot_control)
    print >>sys.stderr, "Ratio is %.6f" % (ratio)
    pulldown.close()
    control.close()
    pybedtools.cleanup()
Author: zocean | Project: Cytoruler | Lines: 23 | Source: TSA-seq_spike_in_get_ratio.py
Example 7: vcf_to_df_worker
def vcf_to_df_worker(arg):
    """ Convert CANVAS vcf to a dict, single thread
    """
    canvasvcf, exonbed, i = arg
    logging.debug("Working on job {}: {}".format(i, canvasvcf))
    samplekey = op.basename(canvasvcf).split(".")[0].rsplit('_', 1)[0]
    d = {'SampleKey': samplekey}

    exons = BedTool(exonbed)
    cn = parse_segments(canvasvcf)
    overlaps = exons.intersect(cn, wao=True)
    gcn_store = {}
    for ov in overlaps:
        # Example of ov.fields:
        # [u'chr1', u'11868', u'12227', u'ENSG00000223972.5',
        #  u'ENST00000456328.2', u'transcribed_unprocessed_pseudogene',
        #  u'DDX11L1', u'.', u'-1', u'-1', u'.', u'0']
        gene_name = "|".join((ov.fields[6], ov.fields[3], ov.fields[5]))
        if gene_name not in gcn_store:
            gcn_store[gene_name] = defaultdict(int)

        cn = ov.fields[-2]
        if cn == ".":
            continue
        cn = int(cn)
        if cn > 10:
            cn = 10
        amt = int(ov.fields[-1])
        gcn_store[gene_name][cn] += amt

    for k, v in sorted(gcn_store.items()):
        v_mean, v_median = counter_mean_and_median(v)
        d[k + ".avgcn"] = v_mean
        d[k + ".medcn"] = v_median
    cleanup()
    return d
Author: xuanblo | Project: jcvi | Lines: 36 | Source: cnv.py
Example 8: bin_frag
def bin_frag(outdir, bin_bed, all_frag_bed, fragcount_bed):
    logging.info("output directory: %s", outdir)
    logging.info("Bin file: %s", bin_bed)
    logging.info("All restriction fragments bed file: %s", all_frag_bed)
    logging.info("Number of fragdata files: %d", len(fragcount_bed))
    os.mkdir(outdir)

    # open bins file
    bins = pbt.BedTool(bin_bed)
    logging.info("read in %8d bins", len(bins))

    # open all frag file
    all_frag = pbt.BedTool(all_frag_bed)
    logging.info("read in %8d restriction fragments", len(all_frag))

    # match up bins with restriction fragments
    # TODO: stats on the result
    bins_with_any_frag = count_frags_per_bin(bins, all_frag)
    logging.info("bins that contained any fragments: %d", len(bins_with_any_frag))

    make_bedgraph_files(fragcount_bed, bins_with_any_frag, outdir)

    # cleanup
    pbt.cleanup(remove_all=True)
Author: wresch | Project: 4C | Lines: 24 | Source: bin.py
Example 9: extract_target_genes_transcripts
def extract_target_genes_transcripts(dicoNiourk):
    if dicoNiourk["target"] != "":
        print("\x1b[0;38;2;" + dicoNiourk["color"]["light1"] + "m")
        sys.stdout.write("\033[F")
        dicoNiourk["spinner"].text = " • Extract target genes"
        dicoNiourk["spinner"].start()
        # Find intersection between gff and target bed
        bed = BedTool(dicoNiourk["target"])
        genes = BedTool(dicoNiourk["refseq_gff"])
        dicoNiourk["target_gene"] = {}  # id to name
        dicoNiourk["target_transcript"] = {}  # id to name
        dico_intersect_transcript = {}
        # Search gff exon intersections
        for intersect_elem in genes + bed:
            if intersect_elem.fields[2] == "exon":
                exon = dicoNiourk["db_gff"][intersect_elem.attrs["ID"]]
                # retrieve corresponding transcript
                for rna in dicoNiourk["db_gff"].parents(exon, featuretype='mRNA', order_by='start'):
                    try: dico_intersect_transcript[rna] += 1
                    except: dico_intersect_transcript[rna] = 1
        #***** FILTER transcripts with all coding exons in target *****#
        for rna in dico_intersect_transcript:
            # retrieve parent gene
            gene = list(dicoNiourk["db_gff"].parents(rna, featuretype='gene', order_by='start'))[0]
            cds_start = list(dicoNiourk["db_gff"].children(rna, featuretype='CDS', order_by='start'))[0].start
            cds_end = list(dicoNiourk["db_gff"].children(rna, featuretype='CDS', order_by='start'))[-1].end
            # Count coding exons
            nb_coding_exons = 0
            for exon in dicoNiourk["db_gff"].children(rna, featuretype='exon', order_by='start'):
                if exon.end > cds_start and exon.start < cds_end: nb_coding_exons += 1
            # Filter transcripts and genes
            if dico_intersect_transcript[rna] >= nb_coding_exons:
                dicoNiourk["target_transcript"][rna.attributes["Name"][0]] = rna.id
                for gene in dicoNiourk["db_gff"].parents(rna, featuretype='gene', order_by='start'):
                    dicoNiourk["target_gene"][gene.attributes["Name"][0]] = gene.id
        dicoNiourk["spinner"].stop()
        printcolor(" • " + str(len(dicoNiourk["target_gene"])) + " genes/" + str(len(dicoNiourk["target_transcript"])) + " rnas extracted\n", "0", dicoNiourk["color"]["light1"], None, dicoNiourk["color"]["bool"])
        cleanup(remove_all=True)  # delete created temp files
Author: dooguypapua | Project: Niourk | Lines: 36 | Source: Niourk_bed_gff.py
Example 10: generate_sc_intervals
# ......... part of the code is omitted here .........
        unmerged_bed = os.path.join(workdir, "unmerged.bed")
        bedtool = pybedtools.BedTool(unmerged_intervals).sort().moveto(unmerged_bed)
        func_logger.info("%d candidate reads" % (bedtool.count()))

        bedtool_lr = {"L": bedtool.filter(lambda x: int(x.name.split(",")[0]) <= int(x.name.split(",")[1])).sort(),
                      "R": bedtool.filter(lambda x: int(x.name.split(",")[0]) > int(x.name.split(",")[1])).sort()}

        bp_merged_intervals = []
        for k_bt, bt in bedtool_lr.iteritems():
            merged_bed = os.path.join(workdir, "merged_%s.bed" % k_bt)
            m_bt = merge_for_each_sv(bt, c="4,5,6,7", o="collapse,sum,collapse,collapse",
                                     svs_to_softclip=svs_to_softclip, d=merge_max_dist,
                                     reciprocal_for_2bp=False, sv_type_field=[6, 0])
            m_bt = m_bt.moveto(merged_bed)
            func_logger.info("%d merged intervals with left bp support" % (m_bt.count()))

            # Check if the other break point also can be merged for the merged intervals (for 2bp SVs)
            for interval in m_bt:
                sv_type = interval.fields[6].split(',')[0]
                if len(set(interval.fields[6].split(','))) != 1:
                    func_logger.warn("More than one svtypes: %s", (str(interval)))
                if sv_type == "INS":
                    bp_merged_intervals.append(interval)
                else:
                    name_fields_0 = interval.name.split(',')
                    other_bps = map(lambda x: int(name_fields_0[3 * x + 1]), range(len(name_fields_0) / 3))
                    if (min(other_bps) + 2 * pad - max(other_bps)) > (-merge_max_dist):
                        bp_merged_intervals.append(interval)
                        continue
                    other_bp_bedtool = bt.filter(lambda x: x.name in interval.name and x.fields[6] == sv_type).each(
                        partial(generate_other_bp_interval, pad=pad)).sort().merge(
                        c="4,5,6,7", o="collapse,sum,collapse,collapse", d=merge_max_dist)
                    if len(other_bp_bedtool) == 1:
                        bp_merged_intervals.append(interval)
                    else:
                        for intvl in other_bp_bedtool:
                            bp_merged_intervals.extend(
                                bt.filter(lambda x: x.name in intvl.name and x.fields[6] == sv_type).sort().merge(
                                    c="4,5,6,7", o="collapse,sum,collapse,collapse", d=merge_max_dist))

        bp_merged_bed = os.path.join(workdir, "bp_merged.bed")
        bedtool = pybedtools.BedTool(bp_merged_intervals).each(
            partial(add_other_bp_fields, pad=pad)).sort().moveto(bp_merged_bed)
        func_logger.info("%d BP merged intervals" % (bedtool.count()))

        filtered_bed = os.path.join(workdir, "filtered.bed")
        bedtool = bedtool.filter(lambda x: int(x.score) >= MIN_SUPPORT_SC_ONLY).each(
            partial(merged_interval_features, bam_handle=sam_file)).moveto(filtered_bed)
        func_logger.info("%d filtered intervals" % (bedtool.count()))

        # Now filter based on coverage
        coverage_filtered_bed = os.path.join(workdir, "coverage_filtered.bed")
        bedtool = bedtool.filter(lambda x: (x.fields[3].split(",")[1] != "INS" or
                                            ((min_ins_cov_frac * mean_read_coverage) <=
                                             (float(x.fields[6]) / abs(x.start - x.end + 1) * mean_read_length) <=
                                             (max_ins_cov_frac * mean_read_coverage)))).moveto(coverage_filtered_bed)
        func_logger.info("%d coverage filtered intervals" % (bedtool.count()))

        thr_sv = {"INS": min_support_frac_ins, "INV": MIN_SUPPORT_FRAC_INV,
                  "DEL": MIN_SUPPORT_FRAC_DEL, "DUP": MIN_SUPPORT_FRAC_DUP}

        # Add number of neighbouring reads that support SC
        bedtool = bedtool.each(partial(add_neighbour_support, bam_handle=sam_file, min_mapq=min_mapq,
                                       min_soft_clip=min_soft_clip, max_nm=max_nm, min_matches=min_matches,
                                       skip_soft_clip=False, isize_mean=isize_mean, min_isize=min_isize,
                                       max_isize=max_isize)).sort().moveto(coverage_filtered_bed)

        neigh_coverage_filtered_bed = os.path.join(workdir, "neigh_filtered.bed")
        bedtool = bedtool.each(partial(filter_low_frac_support, thr_sv=thr_sv)).each(
            partial(filter_low_neigh_read_support_INS, min_support_ins=min_support_ins)).sort().moveto(
            neigh_coverage_filtered_bed)
        func_logger.info("%d neighbour support filtered intervals" % (bedtool.count()))

        # For 2bp SVs, the interval will be the cover of two intervals on the BP
        full_filtered_bed = os.path.join(workdir, "full_neigh_filtered.bed")
        bedtool = bedtool.each(partial(get_full_interval, pad=pad)).sort().moveto(full_filtered_bed)
        func_logger.info("%d full filtered intervals" % (bedtool.count()))

        # Now merge on full intervals
        merged_full_filtered_bed = os.path.join(workdir, "merged_full.bed")
        if bedtool.count() > 0:
            bedtool = merge_for_each_sv(bedtool, c="4,5,6,7,9", o="collapse,collapse,collapse,collapse,collapse",
                                        svs_to_softclip=svs_to_softclip,
                                        overlap_ratio=overlap_ratio,
                                        reciprocal_for_2bp=True,
                                        sv_type_field=[3, 1], d=merge_max_dist)
            bedtool = bedtool.each(partial(fix_merged_fields, inter_tools=False)).each(partial(fine_tune_bps, pad=pad))
            bedtool = bedtool.filter(lambda x: x.score != "-1").sort().moveto(merged_full_filtered_bed)
            func_logger.info("%d merged full intervals" % (bedtool.count()))

        sam_file.close()
    except Exception as e:
        func_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        raise e

    pybedtools.cleanup(remove_all=True)
    func_logger.info("Generated intervals in %g seconds for region %s" % ((time.time() - start_time), chromosome))
    return merged_full_filtered_bed
Author: chapmanb | Project: metasv | Lines: 101 | Source: generate_sv_intervals.py
Example 11: teardown
def teardown():
    pybedtools.cleanup(remove_all=True)
Author: PanosFirmpas | Project: pybedtools | Lines: 2 | Source: test_iter.py
Example 12: run_age_parallel
def run_age_parallel(intervals_bed=None, reference=None, assembly=None, pad=AGE_PAD, age=None, age_workdir=None,
                     timeout=AGE_TIMEOUT, keep_temp=False, assembly_tool="spades", chrs=[], nthreads=1,
                     min_contig_len=AGE_MIN_CONTIG_LENGTH,
                     max_region_len=AGE_MAX_REGION_LENGTH, sv_types=[],
                     min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH, min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH,
                     age_window=AGE_WINDOW_SIZE):
    func_logger = logging.getLogger("%s-%s" % (run_age_parallel.__name__, multiprocessing.current_process()))

    if not os.path.isdir(age_workdir):
        func_logger.info("Creating %s" % age_workdir)
        os.makedirs(age_workdir)

    if assembly:
        if not os.path.isfile("%s.fai" % assembly):
            func_logger.info("Assembly FASTA wasn't indexed. Will attempt to index now.")
            pysam.faidx(assembly)

        func_logger.info("Loading assembly contigs from %s" % assembly)
        with open(assembly) as assembly_fd:
            if assembly_tool == "spades":
                contigs = [SpadesContig(line[1:]) for line in assembly_fd if line[0] == '>']
            elif assembly_tool == "tigra":
                contigs = [TigraContig(line[1:]) for line in assembly_fd if line[0] == '>']
    else:
        contigs = []

    chrs = set(chrs)
    sv_types = set(sv_types)
    contig_dict = {contig.sv_region.to_tuple(): [] for contig in contigs if (len(
        chrs) == 0 or contig.sv_region.chrom1 in chrs) and contig.sequence_len >= min_contig_len and contig.sv_region.length() <= max_region_len and (
        len(sv_types) == 0 or contig.sv_type in sv_types)}

    func_logger.info("Generating the contig dictionary for parallel execution")
    small_contigs_count = 0
    for contig in contigs:
        if contig.sv_region.length() > max_region_len:
            func_logger.info("Too large SV region length: %d > %d" % (contig.sv_region.length(), max_region_len))
            continue
        if (len(chrs) == 0 or contig.sv_region.chrom1 in chrs) and (len(sv_types) == 0 or contig.sv_type in sv_types):
            if contig.sequence_len >= min_contig_len:
                contig_dict[contig.sv_region.to_tuple()].append(contig)
            else:
                small_contigs_count += 1

    region_list = sorted(contig_dict.keys())
    nthreads = min(nthreads, len(region_list))

    if nthreads == 0:
        func_logger.warning("AGE not run since no contigs found")
        return None

    func_logger.info("Will process %d regions with %d contigs (%d small contigs ignored) using %d threads" % (
        len(region_list), sum([len(value) for value in contig_dict.values()]), small_contigs_count, nthreads))

    pybedtools.set_tempdir(age_workdir)
    pool = multiprocessing.Pool(nthreads)

    breakpoints_beds = []
    for i in xrange(nthreads):
        region_sublist = [region for (j, region) in enumerate(region_list) if (j % nthreads) == i]
        kwargs_dict = {"intervals_bed": intervals_bed, "region_list": region_sublist, "contig_dict": contig_dict,
                       "reference": reference, "assembly": assembly, "pad": pad, "age": age,
                       "age_workdir": age_workdir, "timeout": timeout, "keep_temp": keep_temp, "myid": i,
                       "min_del_subalign_len": min_del_subalign_len, "min_inv_subalign_len": min_inv_subalign_len,
                       "age_window": age_window}
        pool.apply_async(run_age_single, args=[], kwds=kwargs_dict,
                         callback=partial(run_age_single_callback, result_list=breakpoints_beds))

    pool.close()
    pool.join()

    func_logger.info("Finished parallel execution")
    func_logger.info("Will merge the following breakpoints beds %s" % (str(breakpoints_beds)))

    pybedtools.cleanup(remove_all=True)

    if not breakpoints_beds:
        return None

    bedtool = pybedtools.BedTool(breakpoints_beds[0])
    for bed_file in breakpoints_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    bedtool = bedtool.moveto(os.path.join(age_workdir, "breakpoints_unsorted.bed"))
    merged_bed = os.path.join(age_workdir, "breakpoints.bed")
    bedtool.sort().saveas(merged_bed)

    return merged_bed
Author: sbandara | Project: metasv | Lines: 89 | Source: run_age.py
Example 13: main
# ......... part of the code is omitted here .........
        sys.stderr.write("%s, RAver %s not in the genome database!\n" %
                         (genome, rv))
        return 1
    # create a tmp bed file with index column.
    in_f = file(input_file_name)
    # filter the comment lines
    input_filtered = [
        line for line in in_f if not (line.lstrip().startswith("#") or len(line.strip()) == 0)]
    # if there is header, store it and remove it from the query BED.
    if rhead:
        headlineL = input_filtered[0].strip().split("\t")
        del input_filtered[0]
    # add index column to the bed lines
    input_indexed = ['%s\t%d\n' % (line.strip(), i)
                     for i, line in enumerate(input_filtered)]
    in_f.close()
    # read all annotations into a dictionary, for the further output.
    anno_bed = os.path.join(
        db_path, genome + "." + anno_db + ".biotype_region_ext.bed")
    try:
        if not os.path.exists(anno_bed):
            raise SystemExit
    except SystemExit:
        sys.stderr.write("%s genome not properly installed!\n" % genome)
        return 1
    # use saveas() to convert the BedTool objects to file-based objects,
    # so they could be used multiple times.
    # When debug, we may use saveas("tss.tmp"), and the output of bedtools
    # could be saved.
    pybedtools.set_tempdir("./")
    anno = pybedtools.BedTool(anno_bed).saveas()
    gd = pybedtools.BedTool(
        os.path.join(db_path, genome + "_geneDesert.bed")).saveas()
    pc = pybedtools.BedTool(
        os.path.join(db_path, genome + "_pericentromere.bed")).saveas()
    st = pybedtools.BedTool(
        os.path.join(db_path, genome + "_subtelomere.bed")).saveas()
    # load the input intervals to be annotated.
    try:
        input_bed = pybedtools.BedTool(
            "".join(input_indexed), from_string=True).saveas()
    except:
        sys.stderr.write("Error in input file! Please check the format!")
        return 1
    list_input = [x.fields[:] for x in input_bed]
    col_no_input = input_bed.field_count()
    # get the midpoint of the intervals.
    # there is a bug in the midpoint function of pybedtools 0.6.3, so here an alternative function was used.
    # input_bed_mid = input_bed.each(pybedtools.featurefuncs.midpoint).saveas()
    input_bed_mid = pybedtools.BedTool(
        "".join([regionanalysis.analysis.midpoint(x) for x in input_indexed]), from_string=True).saveas()
    # intersectBed with annotations.
    input_GB = input_bed_mid.intersect(anno, wao=True).saveas()
    list_GB = [x.fields[:] for x in input_GB]
    input_gd = input_bed_mid.intersect(gd, c=True, f=0.5).saveas()
    list_gd = [x.fields[col_no_input + 0] for x in input_gd]
    input_pc = input_bed_mid.intersect(pc, c=True, f=0.5).saveas()
    list_pc = [x.fields[col_no_input + 0] for x in input_pc]
    input_st = input_bed_mid.intersect(st, c=True, f=0.5).saveas()
    list_st = [x.fields[col_no_input + 0] for x in input_st]
    # groupby the intersectBed results based on the index column.
    input_idx = key = lambda s: s[col_no_input - 1]
    GB_dict = {}
    for key, GB_hits in groupby(list_GB, key=input_idx):
        GB_dict[key] = list(v for v in GB_hits)
    output_file_best = file(input_file_name + ".annotated", "w")
    output_file = file(input_file_name + ".full.annotated", "w")
    output_file_json = file(input_file_name + ".full.annotated.json", "w")
    # Output the header.
    if rhead:
        output_file.write("\t".join(
            headlineL + ["GName", "TName", "Strand", "TSS", "TES", "Feature", "D2TSS", "Biotype", "GeneSymbol"]) + "\n")
        output_file_best.write("\t".join(
            headlineL + ["GName", "TName", "Strand", "TSS", "TES", "Feature", "D2TSS", "Biotype", "GeneSymbol"]) + "\n")
    # write to the output: input.bed.annotated, input.bed.full.annotated.
    json_dict = {}
    for i in range(0, len(input_bed)):
        output_lineL = list_input[i][:-1]  # original input line
        json_dict[str(i)] = {}
        json_dict[str(i)]["query_interval"] = output_lineL
        formatted, best_hit = regionanalysis.analysis.getBestHit(
            anno_db, col_no_input, GB_dict[str(i)], list_gd[i], list_st[i], list_pc[i])
        output_file_best.write("\t".join(output_lineL + best_hit) + "\n")
        json_dict[str(i)]["best_hit"] = best_hit
        for j in formatted:
            output_file.write("\t".join(output_lineL + j) + "\n")
        json_dict[str(i)]["all_hits"] = formatted
    output_file_best.close()
    output_file.close()
    json.dump(json_dict, output_file_json, sort_keys=True, indent=2)
    output_file_json.close()
    pybedtools.cleanup()
    return 0
Author: Johan35 | Project: region_analysis | Lines: 101 | Source: region_analysis.py
Example 14:
import pybedtools

pybedtools.set_tempdir('.')
a = pybedtools.BedTool('a.bed')
a.intersect(a)
pybedtools.cleanup(verbose=True)
Author: radaniba | Project: pybedtools | Lines: 6 | Source: test1.py
Example 15: teardown_module
def teardown_module():
    if os.path.exists(test_tempdir):
        os.system('rm -r %s' % test_tempdir)
    pybedtools.cleanup()
Author: daler | Project: pybedtools | Lines: 4 | Source: tfuncs.py
Example 16: mergeVCFfiles
def mergeVCFfiles(chrom, A, B, file1, file2):
    ## Write headers to a file. Needed later for creating a correct VCF of the intersected files.
    header1 = gzip.open((file1 + chrom + '.vcf.gz'), 'r')
    header2 = gzip.open((file2 + chrom + '.vcf.gz'), 'r')
    headerFile = (path + 'HEADER_' + A + '_' + B + '_.vcf.gz')
    f = gzip.open(headerFile, 'ab')
    for line in header1:
        if line[0:2] != '##':
            break
        f.write(line)
    header1.close()
    for line in header2:
        if line[0:2] != '##':
            break
        f.write(line)
    header2.close()
    f.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO')
    f.close()

    ## Intersect the files with -loj
    a = BedTool((file1 + chrom + '.vcf.gz'))
    b = BedTool((file2 + chrom + '.vcf.gz'))
    a_b = a.intersect(b, header=True, loj=True)
    ab = a_b.saveas((path + 'LOJ_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'))
    print (ab.fn)
    cleanup(remove_all=True)

    ## Intersect the files with -v
    a = BedTool((file1 + chrom + '.vcf.gz'))
    b = BedTool((file2 + chrom + '.vcf.gz'))
    b_a = b.intersect(a, header=True, v=True)
    ba = b_a.saveas((path + 'V_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'))
    print (ba.fn)
    cleanup(remove_all=True)

    ## Concatenate the LOJ file and the -v file
    LOJ = path + 'LOJ_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'
    V = path + 'V_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'
    out = path + 'concated_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'
    os.system(("vcf-concat " + LOJ + " " + V + " | gzip -c > " + out))
    #print "ok"

    ## correct to vcf, merge equal type samples
    out2 = 'stage2/unsorted_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'
    import correctToVCF
    goVCF = correctToVCF.CorrectToVCF()
    goVCF.writeHeader(headerFile, out2)
    goVCF.correctFile(out, out2)

    ## sort the VCF file
    os.system(("vcf-sort " + out2 + " | gzip -c > " + ("germlineVCF/" + A + '_' + B + '_chr' + chrom + '.vcf.gz')))
    cleanup(remove_all=True)
Author: oyvisorb | Project: thesis | Lines: 71 | Source: mergeGermlineVCFfiles.py
Example 17: parallel_generate_sc_intervals
def parallel_generate_sc_intervals(bams, chromosomes, skip_bed, workdir, num_threads=1,
                                   min_avg_base_qual=SC_MIN_AVG_BASE_QUAL,
                                   min_mapq=SC_MIN_MAPQ, min_soft_clip=SC_MIN_SOFT_CLIP,
                                   max_soft_clip=SC_MAX_SOFT_CLIP, pad=SC_PAD,
                                   min_support=MIN_SUPPORT, min_support_frac=MIN_SUPPORT_FRAC,
                                   max_intervals=MAX_INTERVALS):
    func_logger = logging.getLogger(
        "%s-%s" % (parallel_generate_sc_intervals.__name__, multiprocessing.current_process()))

    if not os.path.isdir(workdir):
        func_logger.info("Creating directory %s" % workdir)
        os.makedirs(workdir)

    if not chromosomes:
        func_logger.info("Chromosome list unspecified. Inferring from the BAMs")
        for bam in bams:
            bamfile = pysam.Samfile(bam, "rb")
            chromosomes += list(bamfile.references)
            bamfile.close()
        chromosomes = sorted(list(set(chromosomes)))
        func_logger.info("Chromosome list inferred as %s" % (str(chromosomes)))

    if not chromosomes:
        func_logger.error("Chromosome list empty")
        return None

    pool = multiprocessing.Pool(num_threads)

    bed_files = []
    for index, (bam, chromosome) in enumerate(itertools.product(bams, chromosomes)):
        process_workdir = os.path.join(workdir, str(index))
        if not os.path.isdir(process_workdir):
            os.makedirs(process_workdir)

        args_list = [bam, chromosome, process_workdir]
        kwargs_dict = {"min_avg_base_qual": min_avg_base_qual, "min_mapq": min_mapq, "min_soft_clip": min_soft_clip,
                       "max_soft_clip": max_soft_clip, "pad": pad, "min_support": min_support,
                       "min_support_frac": min_support_frac}
        pool.apply_async(generate_sc_intervals, args=args_list, kwds=kwargs_dict,
                         callback=partial(generate_sc_intervals_callback, result_list=bed_files))

    pool.close()
    pool.join()

    func_logger.info("Following BED files will be merged: %s" % (str(bed_files)))

    if not bed_files:
        func_logger.warn("No intervals generated")
        return None

    pybedtools.set_tempdir(workdir)
    bedtool = pybedtools.BedTool(bed_files[0])

    for bed_file in bed_files[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    bedtool = bedtool.moveto(os.path.join(workdir, "all_intervals.bed"))

    func_logger.info("Selecting the top %d intervals based on normalized read support" % max_intervals)
    top_intervals_all_cols_file = os.path.join(workdir, "top_intervals_all_cols.bed")
    if bedtool.count() <= max_intervals:
        bedtool = bedtool.saveas(top_intervals_all_cols_file)
    else:
        # Sample the top intervals
        top_fraction_cutoff = sorted([float(interval.score) / float(interval.fields[6]) for interval in bedtool],
                                     reverse=True)[max_intervals - 1]
        bedtool = bedtool.filter(lambda x: float(x.score) / float(x.fields[6]) >= top_fraction_cutoff).moveto(
            top_intervals_all_cols_file)

    # Filter out the extra column added to simplify life later on
    bedtool = bedtool.cut(xrange(6)).saveas(os.path.join(workdir, "top_intervals.bed"))

    if skip_bed:
        skip_bedtool = pybedtools.BedTool(skip_bed)
        func_logger.info(
            "Merging %d features with %d features from %s" % (bedtool.count(), skip_bedtool.count(), skip_bed))
        bedtool = skip_bedtool.cat(bedtool, postmerge=False).sort()
        func_logger.info("After merging with %s %d features" % (skip_bed, bedtool.count()))

    bedtool = bedtool.saveas(os.path.join(workdir, "intervals.bed"))

    pybedtools.cleanup(remove_all=True)

    return bedtool.fn
Author: BioinformaticsArchive | Project: metasv | Lines: 79 | Source: generate_sv_intervals.py
Example 18: generate_sc_intervals
def generate_sc_intervals(bam, chromosome, workdir, min_avg_base_qual=SC_MIN_AVG_BASE_QUAL, min_mapq=SC_MIN_MAPQ,
                          min_soft_clip=SC_MIN_SOFT_CLIP, max_soft_clip=SC_MAX_SOFT_CLIP, pad=SC_PAD,
                          min_support=MIN_SUPPORT, max_isize=1000000000, min_support_frac=MIN_SUPPORT_FRAC):
    func_logger = logging.getLogger("%s-%s" % (generate_sc_intervals.__name__, multiprocessing.current_process()))

    if not os.path.isdir(workdir):
        func_logger.error("Working directory %s doesn't exist" % workdir)
        return None

    func_logger.info("Generating candidate insertion intervals from %s for chromosome %s" % (bam, chromosome))
    pybedtools.set_tempdir(workdir)

    unmerged_intervals = []
    start_time = time.time()
    try:
        sam_file = pysam.Samfile(bam, "rb")
        for aln in sam_file.fetch(reference=chromosome):
            if abs(aln.tlen) > max_isize:
                continue
            if not is_good_candidate(aln, min_avg_base_qual=min_avg_base_qual, min_mapq=min_mapq,
                                     min_soft_clip=min_soft_clip, max_soft_clip=max_soft_clip):
                continue
            # ......... the rest of this example is truncated in the source page .........