Python io.read_sequences Function Code Examples


This article collects typical usage examples of the Python function nesoni.io.read_sequences. If you are wondering what read_sequences does, how to call it, or what real-world uses of it look like, the curated code examples below should help.



A total of 20 code examples of the read_sequences function are shown below, ordered by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
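
Before working through the full examples, here is a minimal sketch of the two calling patterns that recur on this page: iterating (name, sequence) pairs from a sequence file, and requesting quality strings for FASTQ-style input. It is pieced together from the usage visible in the examples below rather than from official nesoni documentation, and the file names 'reference.fa' and 'reads.fastq' are placeholders.

from nesoni import io

# Iterate (name, sequence) pairs from a FASTA/GenBank file.
# 'reference.fa' is a placeholder path, not part of the API.
for name, seq in io.read_sequences('reference.fa'):
    print(name.split()[0] + '\t' + str(len(seq)))

# For FASTQ input, the examples below pass qualities='required' (or qualities=True);
# each yielded item then also carries a quality string.
for name, seq, qual in io.read_sequences('reads.fastq', qualities='required'):
    assert len(seq) == len(qual)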

Example 1: build_snpeff

 def build_snpeff(self):
     jar = io.find_jar('snpEff.jar')
     
     with open(self/'snpeff.config','wb') as f:
         print >> f, 'data_dir = snpeff'
         print >> f, 'genomes : ' + self.name
         print >> f, self.name + '.genome : ' + self.name 
     
     snpwork = io.Workspace(self/'snpeff',must_exist=False)
     snpwork_genome = io.Workspace(snpwork/self.name,must_exist=False)
     snpwork_genomes = io.Workspace(snpwork/'genomes',must_exist=False)
     
     annotations = self.annotations_filename()
     assert annotations
     with open(snpwork_genome/'genes.gff','wb') as f:
         for record in annotation.read_annotations(annotations):
             if record.end <= record.start: continue
             if not record.attr:
                 record.attr['attributes'] = 'none'
             print >> f, record.as_gff()
     
     with open(snpwork_genomes/(self.name+'.fa'),'wb') as f:
         for name, seq in io.read_sequences(self.reference_fasta_filename()):
             io.write_fasta(f, name, seq)
             
     io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
         JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
Developer: Victorian-Bioinformatics-Consortium, Project: nesoni, Lines: 27, Source: reference_directory.py


Example 2: run

    def run(self):
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(self.annotation))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(self.genome):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ] + self.extra,
                index = self.index, shrimp = self.shrimp, 
                bowtie = self.bowtie, star = self.star
                ).run()
Developer: Victorian-Bioinformatics-Consortium, Project: tail-tools, Lines: 34, Source: reference_directory_ensembl.py


Example 3: make_file_for_primer_3

def make_file_for_primer_3 (gff_file, ref_file, names_file, output_file):
    # check for a tmp directory
    if len(glob.glob("./tmp")) == 0:
        call (["mkdir", "tmp"])

    gff_file = list(annotation.read_annotations(gff_file))
    print "\n Reading in the reference file"
    seq_dict = dict(io.read_sequences(ref_file))
    names_file = open(names_file).readlines()
    config  = open("primer_config.txt").readlines()

    with open("tmp/regions_" + output_file, 'w') as out_f:
        for name in names_file:
            sname = name.strip("\n")
            found = False
            for line in gff_file:
                gff_name = line.attr.get ("Name", "No_name")
                peak = line.attr.get ("id", "No_id")
                if sname in gff_name.split("/"):
                    out_f.write ("SEQUENCE_ID="+ gff_name.replace("/", "_") + "_" + peak + "\n")
                    out_f.write("SEQUENCE_TEMPLATE=" + line.shifted(-100, 0).get_seq(seq_dict) + "\n")
                    found = True
                    for cline in config:
                        out_f.write(cline.strip("\n")  + "\n")
                    out_f.write("="  + "\n")
            if found ==False:
                print "Could not find the gene " + sname + " in the gff file"
Developer: AndrewPattison, Project: Python-scripts, Lines: 27, Source: get_primers.py


Example 4: main

def main(args):
    size, args = grace.get_option_value(args,'--size',int,200)
    stride, args = grace.get_option_value(args,'--stride',int,50)
    grace.expect_no_further_options(args)
    
    if not args:
        print USAGE
        return 1
    
    for filename in args:
        for name, seq in io.read_sequences(filename):
            name_parts = name.split(None, 1)
            name = name_parts[0]
            if len(name_parts) > 1:
               desc = ' ' + name_parts[1]
            else:
               desc = ''
            
            for i in xrange(-size+stride,len(seq),stride):
                start = max(0,min(len(seq),i))
                end = max(0,min(len(seq), i+size))
                io.write_fasta(
                    sys.stdout,
                    '%s:%d..%d' % (name,start+1,end) + desc,
                    seq[start:end]
                )
    
    return 0
Developer: Slugger70, Project: nesoni, Lines: 28, Source: shred.py


Example 5: run

    def run(self):
        f = self.begin_output()
    
        for filename in self.filenames:
            any = False
            
            name = os.path.splitext(os.path.split(filename)[1])[0]
            
            try:
               iterator = io.read_sequences(filename, qualities=True)
            except grace.Error:
               iterator = None
               
            if iterator is not None:
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'average length', float(total_length)/total)
                print >> f
                any = True
            
            try:
                iterator = annotation.read_annotations(filename)
            except grace.Error:
                iterator = None
                
            if iterator:
                total = 0
                counts = { }
                for item in iterator:
                    total += 1
                    counts[item.type] = counts.get(item.type,0)+1
                                
                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features', counts[key])
                print >> f
                any = True
            
            if not any:
                raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')

        self.end_output(f)
Developer: Slugger70, Project: nesoni, Lines: 46, Source: trivia.py


Example 6: run

    def run(self):
        base = os.path.split(self.prefix)[1]
        
        annotations = [ ]
        sequences = [ ]
        
        for filename in self.filenames:
            any = False
            if io.is_sequence_file(filename):
                sequences.append(filename)
                any = True
            if annotation.is_annotation_file(filename):
                annotations.append(filename)
                any = True
            assert any, 'File is neither a recognized sequence or annotation file'

        cytoband_filename = os.path.join(self.prefix,base+'_cytoband.txt')
        property_filename = os.path.join(self.prefix,'property.txt')
        gff_filename = os.path.join(self.prefix,base+'.gff')
        output_filenames = [ cytoband_filename, property_filename, gff_filename ] 

        if not os.path.exists(self.prefix):
            os.mkdir(self.prefix)
            
        f = open(property_filename,'wb')
        print >> f, 'ordered=true'
        print >> f, 'id=%s' % base
        print >> f, 'name=%s' % (self.name or base)
        print >> f, 'cytobandFile=%s_cytoband.txt' % base
        print >> f, 'geneFile=%s.gff' % base
        print >> f, 'sequenceLocation=%s' % base
        f.close()
        
        trivia.As_gff(output=gff_filename,
               filenames=annotations,
               exclude=[ 'gene', 'source' ]
        ).run()
        
        f_cyt = open(cytoband_filename,'wb')
        for filename in sequences:
            for name, seq in io.read_sequences(filename):
                assert '/' not in name
                f = open(os.path.join(self.prefix, name + '.txt'), 'wb')
                f.write(seq)
                f.close()
                print >> f_cyt, '%s\t0\t%d' % (name, len(seq))
        f_cyt.close()
        
        genome_filename = self.prefix + '.genome'
        if os.path.exists(genome_filename):
            os.unlink(genome_filename)
        io.execute(
            ['zip', '-j', io.abspath(genome_filename)] +
            [ io.abspath(item) for item in output_filenames ]
        )
        for filename in output_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
Developer: Slugger70, Project: nesoni, Lines: 58, Source: igv.py


Example 7: get_lengths

 def get_lengths(self):
     #Legacy working directory
     if not self.object_exists('reference-lengths.pickle.gz'):
         lengths = [ ]
         for name, seq in io.read_sequences(self.reference_fasta_filename()):
             name = name.split()[0]
             lengths.append( (name, len(seq)) )
         self.set_object(lengths, 'reference-lengths.pickle.gz')
             
     return self.get_object('reference-lengths.pickle.gz')
Developer: Slugger70, Project: nesoni, Lines: 10, Source: reference_directory.py


Example 8: convert

 def convert(filename):
     info = io.get_file_info(filename)
     ok = selection.matches('type-fastq:[compression-none/compression-gzip/compression-bzip2]', info)
     if ok:
         return filename            
     result_name = tempname()
     with open(result_name,'wb') as f:
         for name, seq, qual in io.read_sequences(filename, qualities='required'):
             io.write_fastq(f, name, seq, qual)
     return result_name
Developer: Puneet-Shivanand, Project: nesoni, Lines: 10, Source: bowtie.py


Example 9: run

    def run(self):
        f = self.begin_output()
    
        for filename in self.filenames:
            info = io.get_file_info(filename)
            
            any = False
            
            name = os.path.splitext(os.path.split(filename)[1])[0]
            
            if info.matches('sequences'):
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'total bases', total_length)
                if total:
                    print >> f, grace.datum(name, 'average length', float(total_length)/total)
                print >> f
                any = True
            
            if info.matches('annotations'):
                total = 0
                counts = { }
                for item in annotation.read_annotations(filename):
                    total += 1
                    counts[item.type] = counts.get(item.type,0)+1
                                
                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features', counts[key])
                print >> f
                any = True
            
            if info.matches('type-vcf'):
                reader_f = io.open_possibly_compressed_file(filename)
                reader = vcf.Reader(reader_f)
                n = 0
                for item in reader:
                    n += 1
                print >> f, grace.datum(name, 'variants', n)
                any = True
            
            if not any:
                raise grace.Error('Don\'t know what to do with ' + filename)

        self.end_output(f)
Developer: drpowell, Project: nesoni, Lines: 49, Source: trivia.py


Example 10: iter_reads

def iter_reads(config, qualities=False):
    if 'stride' not in config:
        raise grace.Error('Please re-run nesoni shrimp, output format has changed')

    stride = config['stride']
    for reads_filename_set in config['reads']:
        if config['solid']:
            reader = [ io.read_solid(filename) for filename in reads_filename_set ]
        else:
            reader = [ io.read_sequences(filename, qualities) for filename in reads_filename_set ]
        reader = itertools.izip(*reader)
        
        for i, items in enumerate(reader):
            if i % stride == 0: 
               for item in items: 
                   yield item
Developer: Puneet-Shivanand, Project: nesoni, Lines: 16, Source: shrimp.py


Example 11: run

    def run(self):
        workspace = self.get_workspace()
        
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)
        variants = collections.defaultdict(list)
        for record in reader:
            variants[record.CHROM].append(record)
        reader_f.close()
        
        for chrom in variants:
            variants[chrom].sort(key=lambda item: item.POS)
        
        filenames = [ workspace/(item+'.fa') for item in reader.samples ]
        for filename in filenames:
            with open(filename,'wb'): pass
        
        for name, seq in io.read_sequences(reference.reference_fasta_filename()):
            for i, sample in enumerate(reader.samples):            
                revised = [ ]
                pos = 0
                for variant in variants[name]:
                    gt = variant.samples[i].data.GT
                    if gt is None: continue
                    assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): '+gt
                    gt_number = int(gt)
                    if gt_number == 0:
                        var_seq = variant.REF
                    else:
                        var_seq = str(variant.ALT[gt_number-1])
                        assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: '+var_seq
                    new_pos = variant.POS-1
                    assert new_pos >= pos, 'Variants overlap.'
                    revised.append(seq[pos:new_pos])
                    pos = new_pos
                    revised.append(var_seq)
                    assert seq[pos:pos+len(variant.REF)].upper() == variant.REF, 'REF column in VCF does not match reference sequence'
                    pos += len(variant.REF)
                revised.append(seq[pos:])
                            
                with open(filenames[i],'ab') as f:
                    io.write_fasta(f, name, ''.join(revised))

            del variants[name]        
        assert not variants, 'Chromosome names in VCF not in reference: '+' '.join(variants)
Developer: simonalpha, Project: nesoni, Lines: 47, Source: variant.py


Example 12: run

    def run(self):    
        f = self.begin_output()
    
        for filename in self.filenames:
            for name, seq in io.read_sequences(filename):
                name_parts = name.split(None, 1)
                name = name_parts[0]
                for i in xrange(-self.size+self.stride,len(seq),self.stride):
                    start = max(0,min(len(seq),i))
                    end = max(0,min(len(seq), i+self.size))
                    shred_name = '%s:%d..%d' % (name,start+1,end)
                    shred_seq = seq
                    if self.quality:
                        io.write_fastq(f, shred_name, seq[start:end], chr(33+self.quality)*(end-start))
                    else:
                        io.write_fasta(f, shred_name, seq[start:end])

        self.end_output(f)
Developer: Puneet-Shivanand, Project: nesoni, Lines: 18, Source: shred.py


Example 13: debias

def debias(args):
    import numpy

    radius, args = grace.get_option_value(args, '--radius', int, 2) 

    dirs = args
    
    for dir_name in dirs:
        for name, seq in io.read_sequences(os.path.join(dir_name,'reference.fa')):
            for suffix, ambig_suffix in [
                ('-depth', '-ambiguous-depth'),
                ('-pairspan-depth', '-ambiguous-pairspan-depth'),
            ]:
                root = grace.filesystem_friendly_name(name)
                full_name = os.path.join(dir_name, root + suffix + '.userplot')
                full_ambig_name = os.path.join(dir_name, root + ambig_suffix + '.userplot')
                if not os.path.exists(full_name): continue
                if not os.path.exists(full_ambig_name): continue
                
                output_suffix = '-%d.userplot' % radius 

                print dir_name, root, output_suffix
                
                depths = numpy.array( read_unstranded_userplot(full_name) )
                ambig_depths = numpy.array( read_unstranded_userplot(full_ambig_name) )
                expect = expected_depth(root, seq, depths, ambig_depths, radius)
                
                write_unstranded_userplot(
                    os.path.join(dir_name, root + suffix + '-expected' + output_suffix),
                    expect) 
                
                corrected = depths / expect * numpy.median(expect)
                corrected[expect <= 5.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(dir_name, root + suffix + '-corrected' + output_suffix),
                    corrected)                 
                
                ambig_corrected = ambig_depths / expect * numpy.median(expect)
                ambig_corrected[expect <= 0.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(dir_name, root + ambig_suffix + '-corrected' + output_suffix),
                    ambig_corrected)                 
Developer: drpowell, Project: nesoni, Lines: 42, Source: trivia.py


Example 14: set_sequences

    def set_sequences(self, filenames):        
        reference_genbank_filename = self / 'reference.gbk'
        reference_filename = self / 'reference.fa'

        reference_genbank_file = open(reference_genbank_filename,'wb')
        any_genbank = [ False ]

        def genbank_callback(name, record):
            """ Make a copy of any genbank files passed in. """
            from Bio import SeqIO
            
            SeqIO.write([record], reference_genbank_file, 'genbank')
            
            f = open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb')
            SeqIO.write([record], f, 'genbank')
            f.close()
            
            any_genbank[0] = True
        
        lengths = [ ]
        seen = set()
        f = open(reference_filename, 'wb')
        for filename in filenames:
            for name, seq in io.read_sequences(filename, genbank_callback=genbank_callback):
                name = name.split()[0]
                assert name not in seen, 'Duplicate chromosome name: ' + name
                seen.add(name)
                lengths.append( (name, len(seq)) )
                io.write_fasta(f, name, seq)
        f.close()        
        self.set_object(lengths, 'reference-lengths.pickle.gz')
        
        reference_genbank_file.close()
        if not any_genbank[0]:
            os.unlink(reference_genbank_filename)
            
        # Create an index of the reference sequences for samtools
        io.execute([
            'samtools', 'faidx', reference_filename
        ])
Developer: Victorian-Bioinformatics-Consortium, Project: nesoni, Lines: 40, Source: reference_directory.py


Example 15: run

    def run(self):
        #mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
        #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
        #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
        #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
        #is_core = (what == 'core') 
        #
        #grace.expect_no_further_options(args)
        #
        #if len(args) < 2:
        #    print >> sys.stderr, HELP
        #    raise grace.Help_shown()
        #
        #output_dir, working_dirs = args[0], args[1:]
        #
        ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
        #assert not path.exists(path.join(output_dir, 'parameters')), \
        #        'Output directory not given'
        #
        #if not path.exists(output_dir):
        #    os.mkdir(output_dir)

        assert self.what in ('core','unique'), 'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core') 
        
        workspace = self.get_workspace()
        
        for name, seq in io.read_sequences(working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)
            
            good = [ True ] * len(seq)
            
            for working_dir in self.working_dirs:
                if is_core:
                   suffix = '-depth.userplot'
                else:
                   suffix = '-ambiguous-depth.userplot'
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name+suffix)
                )
                assert len(seq) == len(data)
                for i in xrange(len(seq)):
                   if good[i]:
                       if is_core:
                           good[i] = data[i] >= self.mincov
                       else:
                           good[i] = data[i] < self.mincov
    
            #Close holes
            start = -self.maxdiff-1
            n_holes = 0
            for i in xrange(len(seq)):
                if good[i]:
                     if 0 < i-start <= self.maxdiff:
                         for j in xrange(start,i): good[j] = True
                         n_holes += 1
                     start = i+1
            self.log.log('Closed '+grace.pretty_number(n_holes)+' holes\n')
            
            
            f = open( workspace/('%s-%s.fa' % (friendly_name,self.what)), 'wb')
            io.write_fasta(f, name,
                ''.join([ (seq[i] if good[i] else 'N')
                          for i in xrange(len(seq)) ])
            )
            f.close()
    
            f = open( workspace/('%s-%s_masked.fa' % (friendly_name,self.what)), 'wb')
            io.write_fasta(f, name,
                ''.join([ (seq[i] if good[i] else seq[i].lower())
                          for i in xrange(len(seq)) ])
            )
            f.close()
    
            f_good = open( workspace/('%s-%s_parts.fa' % (friendly_name,self.what)), 'wb')
            f_nongood = open( workspace/('%s-non%s_parts.fa' % (friendly_name,self.what)), 'wb')
            start = 0
            n_good = [0]
            n_good_bases = [0]    
            def emit(i):
                if i-start < self.minsize: return
                if good[start]:
                    n_good[0] += 1
                    n_good_bases[0] += i-start
                io.write_fasta(
                    f_good if good[start] else f_nongood,
                    '%s:%d..%d' % (name, start+1,i),
                    seq[start:i]
                )
            for i in xrange(1,len(seq)):
                if good[i] != good[start]:
                    emit(i)
                    start = i
            emit(len(seq))
            f_nongood.close()
            f_good.close()
            
            self.log.log(grace.pretty_number(sum(good))+' bases are '+self.what+', of '+grace.pretty_number(len(seq))+' in reference sequence\n')
            self.log.log(grace.pretty_number(n_good[0])+' parts at least '+grace.pretty_number(self.minsize)+' bases long with '+grace.pretty_number(n_good_bases[0])+' total bases\n')
#......... part of the code omitted here .........
Developer: Puneet-Shivanand, Project: nesoni, Lines: 101, Source: core.py


Example 16: recombination

def recombination(args):
    grace.expect_no_further_options(args)
    if len(args) != 2:
        print >> sys.stderr, USAGE
        raise grace.Help_shown()

    working_dir, seq_name = args

    references = dict(io.read_sequences(os.path.join(working_dir, 'reference.fa')))
    
    depth = { }
    prefixes = { }
    suffixes = { }
    for name in references:
        depth[name] = numpy.zeros(len(references[name]), 'int64')
        prefixes[name] = [ [] for base in references[name] ]
        suffixes[name] = [ [] for base in references[name] ]
    def register_divergence(hit):
        if not hit.query_forward:
            hit = hit.reversed()
        
        margin = 20
        
        if hit.target_end - hit.target_start < 20: 
            return False
        
        depth[hit.target_name][hit.target_start : hit.target_end] += 1

        any = False
        
        if hit.query_end <= len(hit.query_seq)-margin: # and hit.target_end < len(hit.target_seq):
            suffixes[hit.target_name][hit.target_end-1].append( hit.query_seq[hit.query_end:] )
            any = True
        
        if hit.query_start >= margin: # and hit.target_start > 0:
            prefixes[hit.target_name][hit.target_start].append( hit.query_seq[:hit.query_start] )
            any = True
        
        return any

    n = 0
    for (read_name, read_seq), hits in shrimp.iter_read_hits(working_dir):
        # Skip reads containing Ns
        if 'N' in read_seq: continue
    
        for line in hits:
            register_divergence(alignment_from_shrimp(line, references, read_name, read_seq))
        
        n += 1
        #if n > 100000:
        #    break
            
        if n%10000 == 0:
            grace.status('Processing read %s' % grace.pretty_number(n))

    grace.status('')

    
    def show_items(items):
        original_length = len(items)
        cut = 0
        while len(items) > 80:
            cut += 1
            items = [ item for item in items if item[0] >= cut ]
        for item in items:
            print item[1]
        if len(items) < original_length:
            print '(and %d more occurring %d times or less)' % (original_length-len(items), cut-1) 
    
    def score(items):
        if not items: return 1.0
        return float(sum( item[0] * item[0] for item in items )) / (sum( item[0] for item in items )**2)
    
    def summarize_prefixes(seqs, pad):
        seqs = sorted(seqs, key=lambda seq: seq[::-1])
        
        cut = 100
        while True:
            items = [ ] 
            for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[-cut:]):
                ss = list(iterator)
                anylong = any( item != seq for item in ss )            
                n = len(ss)
                items.append( (n, ('%'+str(pad)+'s')%(('...' if anylong else '') + seq) + ' x %d' % n) )
            
            if score(items) >= 1.0/20: break
            cut -= 1
        
        show_items(items)
        
    def summarize_suffixes(seqs, pad):
        seqs = sorted(seqs)
        
        cut = 100
        while True:
            items = [ ] 
            for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[:cut]):
                ss = list(iterator)            
                anylong = any( item != seq for item in ss )            
                n = len(ss)
#......... part of the code omitted here .........
Developer: Puneet-Shivanand, Project: nesoni, Lines: 101, Source: recombination.py


Example 17: fill_scaffolds

def fill_scaffolds(args):
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)
    
    if len(args) < 2:
        print USAGE
        return 1
    
    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]
    
    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)
        
        scaffold = [ ]
        for item in args:
            scaffold.append( ('contig', int(item)) )
            scaffold.append( ('gap', None) )
        
        if not circular: scaffold = scaffold[:-1]
        
        name = 'custom_scaffold_%d' % (len(scaffolds)+1)
        scaffolds.append( (name, scaffold) )
            
    grace.execute(args, [scaffold])
    
    custom_scaffolds = (len(scaffolds) != 0)    
    
    sequences = dict( 
        (a.split()[0], b.upper()) 
          for a,b in 
            io.read_sequences(os.path.join(
              graph_dir, '454AllContigs.fna')))
    
    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1,len(sequence_names)+1)))
    
    contexts = { }
    context_names = { }
    context_depths = { }
    for i in xrange(1,len(sequence_names)+1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        context_names[ i ] = sequence_names[i-1]+'-fwd'
        contexts[ -i ] = bio.reverse_complement(seq)
        context_names[ -i ] = sequence_names[i-1]+'-rev'
    
    links = collections.defaultdict(list)
    
    for line in open(
      os.path.join(graph_dir, '454ContigGraph.txt'),
      'rU'):
        parts = line.rstrip('\n').split('\t')
        
        if parts[0].isdigit():
            seq = sequence_ids[parts[1]]
            context_depths[ seq] = float(parts[3])
            context_depths[-seq] = float(parts[3])
        
        if parts[0] == 'C':    
            name1 = 'contig%05d' % int(parts[1])
            dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
            name2 = 'contig%05d' % int(parts[3])
            dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
            depth = int(parts[5])
            #print name1, dir1, name2, dir2, depth
            
            links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
            links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )
    
        if parts[0] == 'S' and not custom_scaffolds:  
            name = 'scaffold%05d' % int(parts[2])  
            components = parts[3].split(';')
            scaffold = [ ]
            for component in components:
                a,b = component.split(':')
                if a == 'gap':
                    scaffold.append( ('gap',int(b)) )
                else:
                    strand = { '+': +1, '-': -1 }[ b ]
                    scaffold.append( ('contig', sequence_ids['contig%05d'%int(a)] * strand) )
            scaffolds.append( (name, scaffold) )
    
    
    
    #paths = { }
    #
    #todo = [ ]
    #for i in contexts:
    #    for depth_left, neg_left in links[-i]:
    #        left = -neg_left
    #        for depth_right, right in links[i]:
    #            todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
    #
    #heapq.heapify(todo)
    #while todo:
    #    score, source, dest, path = heapq.heappop(todo)
    #    if (source,dest) in paths: continue
    #    
    #    paths[(source,dest)] = path
#......... part of the code omitted here .........
Developer: Puneet-Shivanand, Project: nesoni, Lines: 101, Source: fill_scaffolds.py


Example 18: run

    def run(self):
        #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'
        
        # Reference genome
        
        #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
        chromosomes = collections.OrderedDict(io.read_sequences(self.reference))

        def get_interpeak_seq(peaks):
            start = min(item.transcription_stop for item in peaks)
            end = max(item.transcription_stop for item in peaks)
            if end-start > self.max_seq: return ''
            if peaks[0].strand >= 0:
                return chromosomes[peaks[0].seqid][start:end]
            else:
                return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])

        def get_prepeak_seq(gene,peaks):
            if gene.strand >= 0:
                start = gene.utr_pos
                end = min(item.transcription_stop for item in peaks)
                if end-start > self.max_seq: return ''
                return chromosomes[gene.seqid][start:end]
            else:
                start = max(item.transcription_stop for item in peaks)
                end = gene.utr_pos
                if end-start > self.max_seq: return ''
                return bio.reverse_complement(chromosomes[gene.seqid][start:end])
        
        # Normalization files
        
        if self.norm_file:
            norm_file = self.norm_file
        else:
            nesoni.Norm_from_counts(self.prefix+'-norm', self.counts).run()
            norm_file = self.prefix+'-norm.csv'

        norms = io.read_grouped_table(norm_file, [('All',str)])['All']
        pair_norm_names = [ ]
        pair_norms = [ ]
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i]+'-peak1')
            pair_norms.append(norms.values()[i])
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i]+'-peak2')
            pair_norms.append(norms.values()[i])
        io.write_grouped_csv(
            self.prefix+'-pairs-norm.csv',
            [('All',io.named_list_type(pair_norm_names)(pair_norms))],
            comments=['#Normalization'],
            )


        # Read data
        
        annotations = list(annotation.read_annotations(self.parents))
        if self.utrs:
            utrs = list(annotation.read_annotations(self.utrs))
        else:
            utrs = [ ]
        children = list(annotation.read_annotations(self.children))
        
        count_table = io.read_grouped_table(self.counts, [
            ('Count',int),
            ('Tail_count',int),
            ('Tail',_float_or_none),
            ('Proportion',_float_or_none),
            ('Annotation',str)
            ])
        counts = count_table['Count']
        tail_counts = count_table['Tail_count']
        proportions = count_table['Proportion']
        tails = count_table['Tail']
        
        samples = counts.value_type().keys()
        sample_tags = { }
        for line in count_table.comments:
            if line.startswith('#sampleTags='):
                parts = line[len('#sampleTags='):].split(',')
                assert parts[0] not in sample_tags
                sample_tags[parts[0]] = parts
        
        for item in children:
            item.weight = sum( counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier']) for name in samples )
        
        parents = [ ]
        id_to_parent = { }
        for item in annotations:
            if item.type != self.parent_type: continue
            assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: '+item.get_id()
            parents.append(item)
            id_to_parent[item.get_id()] = item
            item.children = [ ]
            #item.cds = [ ]
            
            # Default utr
            if item.strand >= 0:
               item.utr_pos = item.end
            else:
               item.utr_pos = item.start
#......... part of the code omitted here .........
Developer: stu2, Project: tail-tools, Lines: 101, Source: alternative_tails.py


Example 19: run

    def run(self):
        references = { }
        for filename in self.reference_filenames:
            for name, seq in io.read_sequences(filename):
                references[name] = seq
        
        tail_lengths = { }
        adaptor_bases = { }
        for filename in self.clips:
            with io.open_possibly_compressed_file(filename) as f:
                for line in f:
                    if line.startswith('#'): continue
                    parts = line.rstrip('\n').split('\t')
                    name = parts[0].split()[0]
                    tail_lengths[name] = int(parts[3])-int(parts[2])
                    adaptor_bases[name] = int(parts[6])
        
        in_file = self.begin_input()
        out_file = self.begin_output()
        
        assert self.prop_a >= 0.0 and self.prop_a <= 1.0
        a_score = 1-self.prop_a
        non_a_score = -self.prop_a
        
        for line in in_file:
            line = line.rstrip()
            if line.startswith('@'):
                print >> out_file, line
                continue
            
            al = Alignment(line)
            
            if al.flag & FLAG_UNMAPPED:
                continue

            #ref = references[al.rname]

            reverse = al.flag & FLAG_REVERSE
            if reverse:
                read_bases = rev_comp(al.seq)
                read_qual = al.qual[::-1]
                cigar = cigar_decode(al.cigar)[::-1]
            else:
                read_bases = al.seq
                read_qual = al.qual
                cigar = cigar_decode(al.cigar)
            
            n_tail = tail_lengths[al.qname]
            
            #if reverse:
            #    if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
            #    bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])    
            #else:
            #    if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
            #    bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail] .upper()#upper was missing for a long time. Bug!
            #
            #extension = 0
            #while extension < n_tail and bases_ref[extension] == 'A':
            #    extension += 1
            
            if reverse:
                feat = annotation.Annotation(al.rname, start=al.pos-1-n_tail, end=al.pos-1, strand=-1)
            else:
                feat = annotation.Annotation(al.rname, start=al.pos-1+al.length, end=al.pos-1+al.length+n_tail, strand=1)
            bases_ref = feat.get_seq(references).upper()
            
            # Allow up to 60% mismatch on As
            # Treat soft clipping as insertion for simplicity
            cigar = cigar.replace("S","I")
            assert "H" not in cigar, "Can't handle hard clipping"
            
            extension = 0
            best_score = 0.0
            score = 0.0
            
            # Soft clipping treated as a mismatch
            i = len(cigar)-1
            while i >= 0 and cigar[i] in "I":
                score += non_a_score
                i -= 1
            
            for i in xrange(n_tail):
                if bases_ref[i] == "A":
                    score += a_score
                else:
                    score += non_a_score
                    
                if score >= best_score:
                    extension = i+1
                    best_score = score
            #print >> sys.stderr, reverse!=0, n_tail, extension, bases_ref
                      
            
            if n_tail-extension > 0:
                al.extra.append('AN:i:%d' % (n_tail-extension))
                al.extra.append('AG:i:%d' % (extension))
            if adaptor_
#......... remainder of this example omitted .........
