Python pybedtools.cleanup Function Code Examples


This article collects typical usage examples of the pybedtools.cleanup function in Python. If you have been asking yourself what exactly cleanup does, how to call it, or what real-world uses look like, the curated examples below should help.



A total of 20 code examples of the cleanup function are shown below, sorted by popularity by default.
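Before diving into the examples, it helps to see the lifecycle that cleanup manages in isolation: every BedTool operation writes its intermediate result to a temp file (pybedtools.*.tmp) inside the configured temp directory, and cleanup() deletes the files created during the current session. A minimal sketch of that lifecycle (the input file names are placeholders):

import pybedtools

# Keep pybedtools temp files in a known directory.
pybedtools.set_tempdir('.')

a = pybedtools.BedTool('a.bed')   # placeholder input files
b = pybedtools.BedTool('b.bed')
c = a.intersect(b)                # result is backed by a pybedtools.*.tmp file
print(c.fn)                       # path of that temp file

# Delete temp files created in this session; cleanup(remove_all=True) would
# also remove leftover pybedtools.*.tmp files from earlier sessions.
pybedtools.cleanup()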

Example 1: test_cleanup

def test_cleanup():
    """
    make sure the tempdir and cleanup work
    """
    assert os.path.abspath(pybedtools.get_tempdir()) == os.path.abspath('.')

    # make a fake tempfile, not created during this pybedtools session
    testfn = 'pybedtools.TESTING.tmp'
    os.system('touch %s' % testfn)
    assert os.path.exists(testfn)

    # make some temp files
    a = pybedtools.BedTool(os.path.join(testdir, 'data', 'a.bed'))
    b = pybedtools.BedTool(os.path.join(testdir, 'data', 'b.bed'))
    c = a.intersect(b)

    # after standard cleanup, c's fn should be gone but the fake one still
    # there...
    pybedtools.cleanup(verbose=True)
    assert os.path.exists(testfn)
    assert not os.path.exists(c.fn)

    # Unless we force the removal of all temp files.
    pybedtools.cleanup(remove_all=True)
    assert not os.path.exists(testfn)

    # a.fn and b.fn better be there still!
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
Developer: Fabrices, Project: pybedtools, Lines: 29, Source: test_helpers.py


Example 2: main

def main():
    """
    Third quick example from the documentation -- count reads in introns
    and exons, in parallel
    """
    ap = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                 usage=__doc__)
    ap.add_argument('--gff', required=True,
                    help='GFF or GTF file containing annotations')
    ap.add_argument('--bam', required=True,
                    help='BAM file containing reads to be counted')
    ap.add_argument('--stranded', action='store_true',
                    help='Use strand-specific merging and overlap. '
                         'Default is to ignore strand')
    ap.add_argument('--no-parallel', dest='noparallel', action='store_true',
                    help='Disables parallel computation')
    ap.add_argument('-o', '--output',
                    help='Optional file to which results will be written; '
                         'default is stdout')
    ap.add_argument('-v', '--verbose', action='store_true',
                    help='Verbose (goes to stderr)')
    args = ap.parse_args()

    gff = args.gff
    bam = args.bam
    stranded = args.stranded
    parallel = not args.noparallel

    # Some GFF files have invalid entries -- like chromosomes with negative
    # coords or features of length = 0.  This line removes them and saves the
    # result in a tempfile
    g = pybedtools.BedTool(gff).remove_invalid().saveas()

    # Decide which version of map to use.  If parallel, we only need 3
    # processes; otherwise fall back to the builtin map.
    if parallel:
        pool = multiprocessing.Pool(processes=3)
        imap = pool.map
    else:
        imap = map

    # Get separate files for introns and exons in parallel (if specified)
    featuretypes = ('intron', 'exon')
    introns, exons = imap(subset_featuretypes, featuretypes)

    # Perform some genome algebra to get unique and shared regions
    exon_only = exons.subtract(introns).merge().remove_invalid().saveas()
    intron_only = introns.subtract(exons).merge().remove_invalid().saveas()
    intron_and_exon = exons\
            .intersect(introns).merge().remove_invalid().saveas()

    # Do intersections with BAM file in parallel (if specified)
    features = (exon_only, intron_only, intron_and_exon)
    results = imap(count_reads_in_features, features)

    labels = ('      exon only:',
              '    intron only:',
              'intron and exon:')

    for label, reads in zip(labels, results):
        print('%s %s' % (label, reads))

    pybedtools.cleanup(verbose=False)
Developer: PanosFirmpas, Project: pybedtools, Lines: 59, Source: intron_exon_reads.py
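Example 2 calls two helpers, subset_featuretypes and count_reads_in_features, that are defined elsewhere in intron_exon_reads.py and not shown above. A minimal sketch of plausible implementations (the helper names and the module-level g, bam, and stranded variables come from the example; the bodies are assumptions modeled on the pybedtools quick-start documentation):

def featuretype_filter(feature, featuretype):
    # Keep only GFF features whose third field matches the requested type.
    return feature[2] == featuretype

def subset_featuretypes(featuretype):
    # Filter the cleaned GFF (module-level `g`) down to a single feature
    # type; saveas() writes the result to a file so it survives the trip
    # across process boundaries.
    result = g.filter(featuretype_filter, featuretype).saveas()
    return pybedtools.BedTool(result.fn)

def count_reads_in_features(features):
    # Count BAM reads overlapping the given features; `s` is the bedtools
    # flag for strand-specific overlap.
    return pybedtools.BedTool(bam).intersect(features, s=stranded).count()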


Example 3: run_spades_parallel

def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=SPADES_MAX_INTERVAL_SIZE,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        svs_to_assemble=SVS_ASSEMBLY_SUPPORTED,
                        stop_on_fail=False, max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    pybedtools.set_tempdir(work)

    logger.info("Running SPAdes on the intervals in %s" % bed)
    if not bed:
        logger.info("No BED file specified")
        return None, None

    bedtool = pybedtools.BedTool(bed)
    total = bedtool.count()

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool] if not chrs else [interval for interval in bedtool if
                                                                         interval.chrom in chrs]
    selected_intervals = filter(partial(should_be_assembled, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble),
                                all_intervals)
    ignored_intervals = filter(partial(shouldnt_be_assembled, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble),
                               all_intervals)

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in xrange(nthreads):
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max, "stop_on_fail": stop_on_fail,
                       "max_read_pairs": max_read_pairs}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    if os.path.getsize(assembled_fasta) > 0:
        logger.info("Indexing the assemblies")
        pysam.faidx(assembled_fasta)
    else:
        logger.error("No assembly generated")
        assembled_fasta = None

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
Developer: thongnt2, Project: metasv, Lines: 57, Source: run_spades.py


Example 4: main

def main():
    args = argparser()
    forwardsites = findSites(args.input, args.reference, "+")
    revsites = findSites(args.input, args.reference, "-")
    try:
        with open(args.output,'wb') as of:
            for l in forwardsites + revsites:
                of.write(l + "\n")
    except TypeError:
        for l in forwardsites + revsites:
            args.output.write(l + "\n")
    pybedtools.cleanup(remove_all=True)
Developer: adamjorr, Project: zypy, Lines: 12, Source: degenerate.py


Example 5: run_spades_parallel

def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[], max_interval_size=50000,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX, disable_deletion_assembly=False, stop_on_fail=False):
    pybedtools.set_tempdir(work)

    bedtool = pybedtools.BedTool(bed)
    total = bedtool.count()

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool] if not chrs else [interval for interval in bedtool if
                                                                         interval.chrom in chrs]
    selected_intervals = filter(partial(should_be_assembled, disable_deletion_assembly=disable_deletion_assembly), all_intervals)
    ignored_intervals = filter(partial(shouldnt_be_assembled, disable_deletion_assembly=disable_deletion_assembly), all_intervals)

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in xrange(nthreads):
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max, "stop_on_fail": stop_on_fail}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    logger.info("Indexing the assemblies")
    pysam.faidx(assembled_fasta)

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
Developer: BioinformaticsArchive, Project: metasv, Lines: 42, Source: run_spades.py


Example 6: GetRatioGenome

def GetRatioGenome():
    pulldown = pysam.Samfile(args.pulldown)
    control = pysam.Samfile(args.control)
    if args.tot_pulldown is not None:
        tot_pulldown = args.tot_pulldown
        tot_control = args.tot_control
    else:
        tot_pulldown = pulldown.mapped + pulldown.unmapped
        tot_control = control.mapped + control.unmapped
    print >>sys.stderr, "Total number of reads in pulldown sample: %d" % (tot_pulldown)
    print >>sys.stderr, "Total number of reads in control sample: %d" % (tot_control)
    # get spike-in read within bed file
    pulldown_bam = pybedtools.BedTool(args.pulldown)
    control_bam = pybedtools.BedTool(args.control)
    spike_pulldown = pulldown_bam.intersect(args.pos,f=0.5).count()
    spike_control = control_bam.intersect(args.pos,f=0.5).count()
    print >>sys.stderr, "Total number of reads mapped in spike-in in pulldown sample: %d" % (spike_pulldown)
    print >>sys.stderr, "Total number of reads mapped in spike-in in control sample: %d" % (spike_control)
    ratio = float(spike_control)/float(spike_pulldown)*float(tot_pulldown)/float(tot_control)
    print >>sys.stderr, "Ratio is %.6f" % (ratio)
    pulldown.close()
    control.close()
    pybedtools.cleanup()
Developer: zocean, Project: Cytoruler, Lines: 23, Source: TSA-seq_spike_in_get_ratio.py
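To make the normalization concrete with hypothetical numbers: if the pulldown sample has 20,000,000 total reads of which 200,000 fall in the spike-in regions, and the control has 10,000,000 total reads with 150,000 in the spike-in regions, then ratio = (150000 / 200000) * (20000000 / 10000000) = 0.75 * 2 = 1.5.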


Example 7: vcf_to_df_worker

def vcf_to_df_worker(arg):
    """ Convert CANVAS vcf to a dict, single thread
    """
    canvasvcf, exonbed, i = arg
    logging.debug("Working on job {}: {}".format(i, canvasvcf))
    samplekey = op.basename(canvasvcf).split(".")[0].rsplit('_', 1)[0]
    d = {'SampleKey': samplekey}

    exons = BedTool(exonbed)
    cn = parse_segments(canvasvcf)
    overlaps = exons.intersect(cn, wao=True)
    gcn_store = {}
    for ov in overlaps:
        # Example of ov.fields:
        # [u'chr1', u'11868', u'12227', u'ENSG00000223972.5',
        # u'ENST00000456328.2', u'transcribed_unprocessed_pseudogene',
        # u'DDX11L1', u'.', u'-1', u'-1', u'.', u'0']
        gene_name = "|".join((ov.fields[6], ov.fields[3], ov.fields[5]))
        if gene_name not in gcn_store:
            gcn_store[gene_name] = defaultdict(int)

        cn = ov.fields[-2]
        if cn == ".":
            continue
        cn = int(cn)
        if cn > 10:
            cn = 10
        amt = int(ov.fields[-1])
        gcn_store[gene_name][cn] += amt

    for k, v in sorted(gcn_store.items()):
        v_mean, v_median = counter_mean_and_median(v)
        d[k + ".avgcn"] = v_mean
        d[k + ".medcn"] = v_median
    cleanup()
    return d
Developer: xuanblo, Project: jcvi, Lines: 36, Source: cnv.py
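The helper counter_mean_and_median is imported from elsewhere in jcvi and not shown. Since each value passed to it maps a copy number to the count of overlapped bases at that copy number, a plausible sketch is a base-weighted mean and median (the function name comes from the example; the body is an assumption):

def counter_mean_and_median(counter):
    # `counter` maps copy number -> number of exon bases observed at that
    # copy number (built above as a defaultdict(int)).
    total = sum(counter.values())
    mean = sum(cn * n for cn, n in counter.items()) / float(total)
    # Weighted median: the copy number where the cumulative base count
    # first reaches half of the total.
    midpoint = total / 2.0
    cumulative = 0
    for cn in sorted(counter):
        cumulative += counter[cn]
        if cumulative >= midpoint:
            return mean, cn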


Example 8: bin_frag

def bin_frag(outdir, bin_bed, all_frag_bed, fragcount_bed):
    logging.info("output directory: %s", outdir)
    logging.info("Bin file: %s", bin_bed)
    logging.info("All restriction fragments bed file: %s", all_frag_bed)
    logging.info("Number of fragdata files: %d", len(fragcount_bed))
    os.mkdir(outdir)

    # open bins file
    bins = pbt.BedTool(bin_bed)
    logging.info("read in %8d bins", len(bins))

    # open all frag file
    all_frag = pbt.BedTool(all_frag_bed)
    logging.info("read in %8d restriction fragments", len(all_frag))

    # match up bins with restriction fragments
    #TODO: stats on the result
    bins_with_any_frag = count_frags_per_bin(bins, all_frag)
    logging.info("bins that contained any fragments: %d", len(bins_with_any_frag))

    make_bedgraph_files(fragcount_bed, bins_with_any_frag, outdir)

    # cleanup
    pbt.cleanup(remove_all = True)
Developer: wresch, Project: 4C, Lines: 24, Source: bin.py


Example 9: extract_target_genes_transcripts

def extract_target_genes_transcripts(dicoNiourk):
    if dicoNiourk["target"]!="":
        print("\x1b[0;38;2;"+dicoNiourk["color"]["light1"]+"m") ; sys.stdout.write("\033[F")
        dicoNiourk["spinner"].text = "    • Extract target genes"
        dicoNiourk["spinner"].start()
        # Find intersection between the GFF and the target BED
        bed = BedTool(dicoNiourk["target"])
        genes = BedTool(dicoNiourk["refseq_gff"])
        dicoNiourk["target_gene"] = {} # id to name
        dicoNiourk["target_transcript"] = {} # id to name
        dico_intersect_transcript = {}
        # Search for GFF exon intersections
        for intersect_elem in genes+bed:
            if intersect_elem.fields[2]=="exon":
                exon = dicoNiourk["db_gff"][intersect_elem.attrs["ID"]]
                # retrieve the corresponding transcript
                for rna in dicoNiourk["db_gff"].parents(exon, featuretype='mRNA', order_by='start'):
                    try: dico_intersect_transcript[rna]+=1
                    except: dico_intersect_transcript[rna] = 1
        #***** FILTER transcript which all coding exons in target *****#
        for rna in dico_intersect_transcript:
            # retrieve parent gene
            gene = list(dicoNiourk["db_gff"].parents(rna, featuretype='gene', order_by='start'))[0]
            cds_start = list(dicoNiourk["db_gff"].children(rna, featuretype='CDS', order_by='start'))[0].start
            cds_end = list(dicoNiourk["db_gff"].children(rna, featuretype='CDS', order_by='start'))[-1].end
            # Count coding exons
            nb_coding_exons = 0
            for exon in dicoNiourk["db_gff"].children(rna, featuretype='exon', order_by='start'):
                if exon.end>cds_start and exon.start<cds_end: nb_coding_exons+=1
            # Filter transcripts and genes
            if dico_intersect_transcript[rna]>=nb_coding_exons:
                dicoNiourk["target_transcript"][rna.attributes["Name"][0]] = rna.id
                for gene in dicoNiourk["db_gff"].parents(rna, featuretype='gene', order_by='start'): dicoNiourk["target_gene"][gene.attributes["Name"][0]] = gene.id
        dicoNiourk["spinner"].stop()
        printcolor("    • "+str(len(dicoNiourk["target_gene"]))+" genes/"+str(len(dicoNiourk["target_transcript"]))+" rnas extracted\n","0",dicoNiourk["color"]["light1"],None,dicoNiourk["color"]["bool"])
        cleanup(remove_all=True) # delete created temp file
Developer: dooguypapua, Project: Niourk, Lines: 36, Source: Niourk_bed_gff.py


Example 10: generate_sc_intervals


#......... part of the code is omitted here .........

        unmerged_bed = os.path.join(workdir, "unmerged.bed")
        bedtool = pybedtools.BedTool(unmerged_intervals).sort().moveto(unmerged_bed)
        func_logger.info("%d candidate reads" % (bedtool.count()))

        bedtool_lr={"L":bedtool.filter(lambda x: int(x.name.split(",")[0])<=int(x.name.split(",")[1])).sort(),
                    "R":bedtool.filter(lambda x: int(x.name.split(",")[0])>int(x.name.split(",")[1])).sort()}
        bp_merged_intervals = []
        for k_bt,bt in bedtool_lr.iteritems():
            merged_bed = os.path.join(workdir, "merged_%s.bed"%k_bt)
            m_bt=merge_for_each_sv(bt,c="4,5,6,7",o="collapse,sum,collapse,collapse",
                                        svs_to_softclip=svs_to_softclip,d=merge_max_dist,
                                        reciprocal_for_2bp=False, sv_type_field = [6,0])
            m_bt = m_bt.moveto(merged_bed)
            func_logger.info("%d merged intervals with left bp support" % (m_bt.count()))

            # Check if the other break point also can be merged for the merged intervals (for 2bp SVs)
            for interval in m_bt:
                sv_type = interval.fields[6].split(',')[0]
                if len(set(interval.fields[6].split(',')))!=1:
                    func_logger.warn("More than one SV type: %s", str(interval))
                if  sv_type == "INS":
                    bp_merged_intervals.append(interval)
                else:
                    name_fields_0 = interval.name.split(',')
                    other_bps = map(lambda x:int(name_fields_0[3*x+1]), range(len(name_fields_0)/3))
                    if (min(other_bps)+2*pad-max(other_bps))>(-merge_max_dist):
                        bp_merged_intervals.append(interval)
                        continue

                    other_bp_bedtool=bt.filter(lambda x: x.name in interval.name and x.fields[6]==sv_type).each(partial(generate_other_bp_interval,pad=pad)).sort().merge(c="4,5,6,7", o="collapse,sum,collapse,collapse", d=merge_max_dist)
                    if len(other_bp_bedtool)==1:
                        bp_merged_intervals.append(interval)
                    else:
                        for intvl in other_bp_bedtool:
                            bp_merged_intervals.extend(bt.filter(lambda x: x.name in intvl.name and x.fields[6]==sv_type).sort().merge(c="4,5,6,7", o="collapse,sum,collapse,collapse", d=merge_max_dist))
                    
        bp_merged_bed = os.path.join(workdir, "bp_merged.bed")
        bedtool=pybedtools.BedTool(bp_merged_intervals).each(partial(add_other_bp_fields,pad=pad)).sort().moveto(bp_merged_bed)       
        func_logger.info("%d BP merged intervals" % (bedtool.count()))

        filtered_bed = os.path.join(workdir, "filtered.bed")
        bedtool = bedtool.filter(lambda x: int(x.score) >= MIN_SUPPORT_SC_ONLY).each(
            partial(merged_interval_features, bam_handle=sam_file)).moveto(
            filtered_bed)
        func_logger.info("%d filtered intervals" % (bedtool.count()))
        
        # Now filter based on coverage
        coverage_filtered_bed = os.path.join(workdir, "coverage_filtered.bed")
        bedtool = bedtool.filter(lambda x: (x.fields[3].split(",")[1]!="INS" or 
                                           ((min_ins_cov_frac*mean_read_coverage)<=(float(x.fields[6])/abs(x.start-x.end+1)*mean_read_length)<=(max_ins_cov_frac*mean_read_coverage)))).moveto(coverage_filtered_bed)
        func_logger.info("%d coverage filtered intervals" % (bedtool.count()))


        thr_sv={"INS":min_support_frac_ins, "INV":MIN_SUPPORT_FRAC_INV, 
                "DEL":MIN_SUPPORT_FRAC_DEL, "DUP": MIN_SUPPORT_FRAC_DUP}

        # Add number of neighbouring reads that support SC
        bedtool=bedtool.each(partial(add_neighbour_support,bam_handle=sam_file, min_mapq=min_mapq, 
                                     min_soft_clip=min_soft_clip, max_nm=max_nm, min_matches=min_matches,
                                     skip_soft_clip=False, isize_mean=isize_mean, min_isize=min_isize, max_isize=max_isize)).sort().moveto(coverage_filtered_bed)

        neigh_coverage_filtered_bed = os.path.join(workdir, "neigh_filtered.bed")
        bedtool = bedtool.each(partial(filter_low_frac_support,thr_sv=thr_sv)).each(
                               partial(filter_low_neigh_read_support_INS,min_support_ins=min_support_ins)).sort().moveto(
                               neigh_coverage_filtered_bed)
        func_logger.info("%d neighbour support filtered intervals" % (bedtool.count()))

        # For 2bp SVs, the interval will be the cover of two intervals on the BP
        full_filtered_bed = os.path.join(workdir, "full_neigh_filtered.bed")
        bedtool = bedtool.each(partial(get_full_interval,pad=pad)).sort().moveto(full_filtered_bed)
        func_logger.info("%d full filtered intervals" % (bedtool.count()))

        # Now merge on full intervals
        merged_full_filtered_bed = os.path.join(workdir, "merged_full.bed")
        if bedtool.count()>0:
            bedtool=merge_for_each_sv(bedtool,c="4,5,6,7,9",o="collapse,collapse,collapse,collapse,collapse",
                                      svs_to_softclip=svs_to_softclip,
                                      overlap_ratio=overlap_ratio,
                                      reciprocal_for_2bp=True, 
                                      sv_type_field = [3,1], d=merge_max_dist)
        bedtool = bedtool.each(partial(fix_merged_fields,inter_tools=False)).each(partial(fine_tune_bps,pad=pad))
        bedtool = bedtool.filter(lambda x: x.score != "-1").sort().moveto(merged_full_filtered_bed)
        func_logger.info("%d merged full intervals" % (bedtool.count()))
        
        sam_file.close()
    except Exception as e:
        func_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        raise e

    pybedtools.cleanup(remove_all=True)
    func_logger.info("Generated intervals in %g seconds for region %s" % ((time.time() - start_time), chromosome))

    return merged_full_filtered_bed
Developer: chapmanb, Project: metasv, Lines: 101, Source: generate_sv_intervals.py


Example 11: teardown

def teardown():
    pybedtools.cleanup(remove_all=True)
Developer: PanosFirmpas, Project: pybedtools, Lines: 2, Source: test_iter.py


Example 12: run_age_parallel

def run_age_parallel(intervals_bed=None, reference=None, assembly=None, pad=AGE_PAD, age=None, age_workdir=None,
                     timeout=AGE_TIMEOUT, keep_temp=False, assembly_tool="spades", chrs=[], nthreads=1,
                     min_contig_len=AGE_MIN_CONTIG_LENGTH,
                     max_region_len=AGE_MAX_REGION_LENGTH, sv_types=[], 
                     min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH, min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH,
                     age_window = AGE_WINDOW_SIZE):
    func_logger = logging.getLogger("%s-%s" % (run_age_parallel.__name__, multiprocessing.current_process()))

    if not os.path.isdir(age_workdir):
        func_logger.info("Creating %s" % age_workdir)
        os.makedirs(age_workdir)

    if assembly:
        if not os.path.isfile("%s.fai" % assembly):
            func_logger.info("Assembly FASTA wasn't indexed. Will attempt to index now.")
            pysam.faidx(assembly)

        func_logger.info("Loading assembly contigs from %s" % assembly)
        with open(assembly) as assembly_fd:
            if assembly_tool == "spades":
                contigs = [SpadesContig(line[1:]) for line in assembly_fd if line[0] == '>']
            elif assembly_tool == "tigra":
                contigs = [TigraContig(line[1:]) for line in assembly_fd if line[0] == '>']
    else:
        contigs = []

    chrs = set(chrs)
    sv_types = set(sv_types)
    contig_dict = {contig.sv_region.to_tuple(): [] for contig in contigs if (len(
        chrs) == 0 or contig.sv_region.chrom1 in chrs) and contig.sequence_len >= min_contig_len and contig.sv_region.length() <= max_region_len and (
                       len(sv_types) == 0 or contig.sv_type in sv_types)}

    func_logger.info("Generating the contig dictionary for parallel execution")
    small_contigs_count = 0
    for contig in contigs:
        if contig.sv_region.length() > max_region_len: 
            func_logger.info("Too large SV region length: %d > %d" % (contig.sv_region.length(),max_region_len))
            continue
        if (len(chrs) == 0 or contig.sv_region.chrom1 in chrs) and (len(sv_types) == 0 or contig.sv_type in sv_types):
            if contig.sequence_len >= min_contig_len:
                contig_dict[contig.sv_region.to_tuple()].append(contig)
            else:
                small_contigs_count += 1

    region_list = sorted(contig_dict.keys())
    nthreads = min(nthreads, len(region_list))

    if nthreads == 0:
        func_logger.warning("AGE not run since no contigs found")
        return None

    func_logger.info("Will process %d regions with %d contigs (%d small contigs ignored) using %d threads" % (
        len(region_list), sum([len(value) for value in contig_dict.values()]), small_contigs_count, nthreads))

    pybedtools.set_tempdir(age_workdir)
    pool = multiprocessing.Pool(nthreads)

    breakpoints_beds = []
    for i in xrange(nthreads):
        region_sublist = [region for (j, region) in enumerate(region_list) if (j % nthreads) == i]
        kwargs_dict = {"intervals_bed": intervals_bed, "region_list": region_sublist, "contig_dict": contig_dict,
                       "reference": reference, "assembly": assembly, "pad": pad, "age": age, "age_workdir": age_workdir,
                       "timeout": timeout, "keep_temp": keep_temp, "myid": i, 
                       "min_del_subalign_len": min_del_subalign_len, "min_inv_subalign_len": min_inv_subalign_len,
                       "age_window" : age_window}
        pool.apply_async(run_age_single, args=[], kwds=kwargs_dict,
                         callback=partial(run_age_single_callback, result_list=breakpoints_beds))

    pool.close()
    pool.join()

    func_logger.info("Finished parallel execution")

    func_logger.info("Will merge the following breakpoints beds %s" % (str(breakpoints_beds)))

    pybedtools.cleanup(remove_all=True)

    if not breakpoints_beds:
        return None

    bedtool = pybedtools.BedTool(breakpoints_beds[0])
    for bed_file in breakpoints_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    bedtool = bedtool.moveto(os.path.join(age_workdir, "breakpoints_unsorted.bed"))
    merged_bed = os.path.join(age_workdir, "breakpoints.bed")
    bedtool.sort().saveas(merged_bed)

    return merged_bed
Developer: sbandara, Project: metasv, Lines: 89, Source: run_age.py


Example 13: main


#......... part of the code is omitted here .........
            sys.stderr.write("%s, RAver %s not in the genome database!\n" %
                             (genome, rv))
            return 1

    # create a tmp bed file with index column.
    in_f = file(input_file_name)
    # filter the comment lines
    input_filtered = [
        line for line in in_f if not (line.lstrip().startswith("#") or len(line.strip())==0)]
    # if there is header, store it and remove it from the query BED.
    if rhead:
        headlineL = input_filtered[0].strip().split("\t")
        del input_filtered[0]
    # add index column to the bed lines
    input_indexed = ['%s\t%d\n' % (line.strip(), i)
                     for i, line in enumerate(input_filtered)]
    in_f.close()

    # read all annotations into a dictionary, for the further output.
    anno_bed = os.path.join(
        db_path, genome + "." + anno_db + ".biotype_region_ext.bed")
    try:
        if not os.path.exists(anno_bed):
            raise SystemExit
    except SystemExit:
        sys.stderr.write("%s genome not properly installed!\n" % genome)
        return 1

    # use saveas() to convert the BedTool objects to file-based objects,
    # so they could be used multiple times.
    # When debug, we may use saveas("tss.tmp"), and the output of bedtools
    # could be saved.
    pybedtools.set_tempdir("./")
    anno = pybedtools.BedTool(anno_bed).saveas()
    gd = pybedtools.BedTool(
        os.path.join(db_path, genome + "_geneDesert.bed")).saveas()
    pc = pybedtools.BedTool(
        os.path.join(db_path, genome + "_pericentromere.bed")).saveas()
    st = pybedtools.BedTool(
        os.path.join(db_path, genome + "_subtelomere.bed")).saveas()

    # load the input intervals to be annotated.
    try:
        input_bed = pybedtools.BedTool(
            "".join(input_indexed), from_string=True).saveas()
    except:
        sys.stderr.write("Error in input file! Please check the format!")
        return 1
    list_input = [x.fields[:] for x in input_bed]
    col_no_input = input_bed.field_count()
    # get the midpoint of the intervals.
    # there is a bug in midpoint function of pybedtools 0.6.3, so here an alternative function was used.
    # input_bed_mid = input_bed.each(pybedtools.featurefuncs.midpoint).saveas()
    input_bed_mid = pybedtools.BedTool(
        "".join([regionanalysis.analysis.midpoint(x) for x in input_indexed]), from_string=True).saveas()

    # intersectBed with annotations.
    input_GB = input_bed_mid.intersect(anno, wao=True).saveas()
    list_GB = [x.fields[:] for x in input_GB]
    input_gd = input_bed_mid.intersect(gd, c=True, f=0.5).saveas()
    list_gd = [x.fields[col_no_input + 0] for x in input_gd]
    input_pc = input_bed_mid.intersect(pc, c=True, f=0.5).saveas()
    list_pc = [x.fields[col_no_input + 0] for x in input_pc]
    input_st = input_bed_mid.intersect(st, c=True, f=0.5).saveas()
    list_st = [x.fields[col_no_input + 0] for x in input_st]

    # groupby the intersectBed results based on the index column.
    input_idx = lambda s: s[col_no_input - 1]
    GB_dict = {}
    for key, GB_hits in groupby(list_GB, key=input_idx):
        GB_dict[key] = list(v for v in GB_hits)

    output_file_best = file(input_file_name + ".annotated", "w")
    output_file = file(input_file_name + ".full.annotated", "w")
    output_file_json = file(input_file_name + ".full.annotated.json", "w")
    # Output the header.
    if rhead:
        output_file.write("\t".join(
            headlineL + ["GName", "TName", "Strand", "TSS", "TES", "Feature", "D2TSS", "Biotype", "GeneSymbol"]) + "\n")
        output_file_best.write("\t".join(
            headlineL + ["GName", "TName", "Strand", "TSS", "TES", "Feature", "D2TSS", "Biotype", "GeneSymbol"]) + "\n")
    # write to the output: input.bed.annotated, input.bed.full.annotated.
    json_dict = {}
    for i in range(0, len(input_bed)):
        output_lineL = list_input[i][:-1]  # original input line
        json_dict[str(i)] = {}
        json_dict[str(i)]["query_interval"] = output_lineL
        formatted, best_hit = regionanalysis.analysis.getBestHit(
            anno_db, col_no_input, GB_dict[str(i)], list_gd[i], list_st[i], list_pc[i])
        output_file_best.write("\t".join(output_lineL + best_hit) + "\n")
        json_dict[str(i)]["best_hit"] = best_hit
        for j in formatted:
            output_file.write("\t".join(output_lineL + j) + "\n")
        json_dict[str(i)]["all_hits"] = formatted
    output_file_best.close()
    output_file.close()
    json.dump(json_dict, output_file_json, sort_keys=True, indent=2)
    output_file_json.close()
    pybedtools.cleanup()
    return 0
Developer: Johan35, Project: region_analysis, Lines: 101, Source: region_analysis.py


Example 14:

import pybedtools

pybedtools.set_tempdir('.')
a = pybedtools.BedTool('a.bed')
a.intersect(a)
pybedtools.cleanup(verbose=True)
Developer: radaniba, Project: pybedtools, Lines: 6, Source: test1.py


Example 15: teardown_module

def teardown_module():
    if os.path.exists(test_tempdir):
        os.system('rm -r %s' % test_tempdir)
    pybedtools.cleanup()
Developer: daler, Project: pybedtools, Lines: 4, Source: tfuncs.py


Example 16: mergeVCFfiles

def mergeVCFfiles(chrom, A, B, file1, file2):

    ## Write headers to a file. Needed later for creating a correct VCF of the intersected files. 
    header1 = gzip.open((file1+chrom+'.vcf.gz'), 'r')
    header2 = gzip.open((file2+chrom+'.vcf.gz'), 'r')

    headerFile = (path+'HEADER_'+A+'_'+B+'_.vcf.gz')
    f = gzip.open(headerFile, 'ab')

    for line in header1:
        if line[0:2] != '##':
            break;
        f.write(line)
    header1.close()

    for line in header2:
        if line[0:2] != '##':
            break;
        f.write(line)
    header2.close()

    f.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO');
    f.close()

    ## Intersects files -LOJ

    a = BedTool((file1+chrom+'.vcf.gz'))
    b = BedTool((file2+chrom+'.vcf.gz'))

    a_b = a.intersect(b, header=True,loj=True)
    ab  = a_b.saveas((path+'LOJ_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'))

    print (ab.fn)
    cleanup(remove_all=True)

    ## Intersects files -V

    a = BedTool((file1+chrom+'.vcf.gz'))
    b = BedTool((file2+chrom+'.vcf.gz'))

    b_a = b.intersect(a, header=True, v=True)
    ba  = b_a.saveas((path+'V_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'))

    print (ba.fn)
    cleanup(remove_all=True)


    ## CAT LOJ file and -v File

    LOJ = path+'LOJ_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'
    V = path+'V_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'
    out = path+'concated_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'

    os.system(("vcf-concat "+LOJ+" "+V+" | gzip -c > "+out))
    #print "ok"



    ## correct to vcf, merge equal type samples
    out2 = 'stage2/unsorted_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'

    import correctToVCF
    goVCF = correctToVCF.CorrectToVCF()
    goVCF.writeHeader(headerFile, out2)
    goVCF.correctFile(out, out2)

    ## sort the VCF file

    os.system(("vcf-sort "+out2+"| gzip -c > "+("germlineVCF/"+A+'_'+B+'_chr'+chrom+'.vcf.gz')))

    cleanup(remove_all=True)
Developer: oyvisorb, Project: thesis, Lines: 71, Source: mergeGermlineVCFfiles.py


Example 17: parallel_generate_sc_intervals

def parallel_generate_sc_intervals(bams, chromosomes, skip_bed, workdir, num_threads=1, min_avg_base_qual=SC_MIN_AVG_BASE_QUAL,
                                   min_mapq=SC_MIN_MAPQ, min_soft_clip=SC_MIN_SOFT_CLIP, max_soft_clip=SC_MAX_SOFT_CLIP, pad=SC_PAD,
                                   min_support=MIN_SUPPORT, min_support_frac=MIN_SUPPORT_FRAC, max_intervals=MAX_INTERVALS):
    func_logger = logging.getLogger(
        "%s-%s" % (parallel_generate_sc_intervals.__name__, multiprocessing.current_process()))

    if not os.path.isdir(workdir):
        func_logger.info("Creating directory %s" % workdir)
        os.makedirs(workdir)

    if not chromosomes:
        func_logger.info("Chromosome list unspecified. Inferring from the BAMs")
        for bam in bams:
            bamfile = pysam.Samfile(bam, "rb")
            chromosomes += list(bamfile.references)
            bamfile.close()
        chromosomes = sorted(list(set(chromosomes)))
        func_logger.info("Chromosome list inferred as %s" % (str(chromosomes)))

    if not chromosomes:
        func_logger.error("Chromosome list empty")
        return None

    pool = multiprocessing.Pool(num_threads)

    bed_files = []
    for index, (bam, chromosome) in enumerate(itertools.product(bams, chromosomes)):
        process_workdir = os.path.join(workdir, str(index))
        if not os.path.isdir(process_workdir):
            os.makedirs(process_workdir)

        args_list = [bam, chromosome, process_workdir]
        kwargs_dict = {"min_avg_base_qual": min_avg_base_qual, "min_mapq": min_mapq, "min_soft_clip": min_soft_clip,
                       "max_soft_clip": max_soft_clip, "pad": pad, "min_support": min_support,
                       "min_support_frac": min_support_frac}
        pool.apply_async(generate_sc_intervals, args=args_list, kwds=kwargs_dict,
                         callback=partial(generate_sc_intervals_callback, result_list=bed_files))

    pool.close()
    pool.join()

    func_logger.info("Following BED files will be merged: %s" % (str(bed_files)))

    if not bed_files:
        func_logger.warn("No intervals generated")
        return None

    pybedtools.set_tempdir(workdir)
    bedtool = pybedtools.BedTool(bed_files[0])

    for bed_file in bed_files[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    bedtool = bedtool.moveto(os.path.join(workdir, "all_intervals.bed"))

    func_logger.info("Selecting the top %d intervals based on normalized read support" % max_intervals)
    top_intervals_all_cols_file = os.path.join(workdir, "top_intervals_all_cols.bed")
    if bedtool.count() <= max_intervals:
        bedtool = bedtool.saveas(top_intervals_all_cols_file)
    else:
        # Sample the top intervals
        top_fraction_cutoff = sorted([float(interval.score) / float(interval.fields[6]) for interval in bedtool], reverse=True)[max_intervals-1]
        bedtool = bedtool.filter(lambda x: float(x.score) / float(x.fields[6]) >= top_fraction_cutoff).moveto(top_intervals_all_cols_file)

    # Filter out the extra column added to simplify life later on
    bedtool = bedtool.cut(xrange(6)).saveas(os.path.join(workdir, "top_intervals.bed"))

    if skip_bed:
        skip_bedtool = pybedtools.BedTool(skip_bed)
        func_logger.info(
            "Merging %d features with %d features from %s" % (bedtool.count(), skip_bedtool.count(), skip_bed))
        bedtool = skip_bedtool.cat(bedtool, postmerge=False).sort()
        func_logger.info("After merging with %s %d features" % (skip_bed, bedtool.count()))

    bedtool = bedtool.saveas(os.path.join(workdir, "intervals.bed"))

    pybedtools.cleanup(remove_all=True)

    return bedtool.fn
Developer: BioinformaticsArchive, Project: metasv, Lines: 79, Source: generate_sv_intervals.py


Example 18: generate_sc_intervals

def generate_sc_intervals(bam, chromosome, workdir, min_avg_base_qual=SC_MIN_AVG_BASE_QUAL, min_mapq=SC_MIN_MAPQ, min_soft_clip=SC_MIN_SOFT_CLIP,
                          max_soft_clip=SC_MAX_SOFT_CLIP, pad=SC_PAD, min_support=MIN_SUPPORT, max_isize=1000000000,
                          min_support_frac=MIN_SUPPORT_FRAC):
    func_logger = logging.getLogger("%s-%s" % (generate_sc_intervals.__name__, multiprocessing.current_process()))

    if not os.path.isdir(workdir):
        func_logger.error("Working directory %s doesn't exist" % workdir)
        return None

    func_logger.info("Generating candidate insertion intervals from %s for chromsome %s" % (bam, chromosome))
    pybedtools.set_tempdir(workdir)

    unmerged_intervals = []
    start_time = time.time()
    try:
        sam_file = pysam.Samfile(bam, "rb")
        for aln in sam_file.fetch(reference=chromosome):
            if abs(aln.tlen) > max_isize:
                continue
            if not is_good_candidate(aln, min_avg_base_qual=min_avg_base_qual, min_mapq=min_mapq,
                                     min_soft_clip=min_soft_clip, max_soft_clip=max_soft_clip):
                continue
#......... the rest of the code is omitted here .........
