本文整理汇总了Python中pyfasta.Fasta类的典型用法代码示例。如果您正苦于以下问题:Python Fasta类的具体用法?Python Fasta怎么用?Python Fasta使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Fasta类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: run
def run(self, filename):
self.openOutFiles(filename)
f = Fasta(filename)
count = len(f)
self.not_found_in_kabat, self.fr4_not_found, current = (0, 0, 0)
for name in f.keys():
current += 1
if current % 1000 == 0:
print "All %d. Current: %d" % (count, current)
# format: vName_jName{frameNumber} or vName_dName{frameNumber}_jName{frameNumber}
vGeneName = name.split("_")[0]
vGeneRegions = self.getVGeneRegions(vGeneName)
if vGeneRegions is None:
continue
withoutMarkup = f[name][vGeneRegions[self.kabat.regions_count * 2 - 1]:]
group = self.findFR4(name, withoutMarkup)
if group is None:
continue
self.result_kabat_file.write(name)
self.result_kabat_file.write(("\t%d" * 10) % tuple(vGeneRegions))
self.result_kabat_file.write(("\t%d" * 4 + "\n") % tuple(
[vGeneRegions[9] + i for i in [1, group.start(), group.start() + 1, len(withoutMarkup)]]))
self.closeOutFiles()
print "all: {}; not in kabat: {}; without fr4: {}".format(current, self.not_found_in_kabat, self.fr4_not_found)
开发者ID:biocad,项目名称:au-summer-2013,代码行数:31,代码来源:markup.py
示例2: parse_sequences
def parse_sequences(sites, size, fasta_file):
"""Adds the binding site sequences extende to 'size' per row (decoded as A=0, C=1, G=2, T=3) to each input region."""
from pyfasta import Fasta # Fasta package is needed to fetch sequences from genome fasta file
print "INFO: Begin to fetch sequences...."
f = Fasta(fasta_file, key_fn=lambda key: key.split()[0])
for i, reg in enumerate(sites):
start = reg["ext_start"]
end = reg["ext_end"]
# if motif on negativ strand, shift region by +1 to account for zero based half-open intervals
if reg["strand"] == '-':
start += 1
end += 1
seq = f.sequence({"chr":reg["chr"], "start":start, "stop":end}, one_based=False)
# Note, the 'strand':reg["strand"] argument for f.sequence does not work, there seems to be a bug in the pyfasta/fasta.py code.
seq = seq.upper()
# if motif on negative strand, convert seq to reverse complement
if reg["strand"] == '-':
seq = reverse_complement(seq)
# add sequence to region dict
reg["ext_seq"] = seq
print "INFO: Finished sequences."
return regions
开发者ID:ComputationalSystemsBiology,项目名称:ExoProfiler,代码行数:32,代码来源:5primeCounter.py
示例3: calc_nuc_counts
def calc_nuc_counts(fasta_filename, region_size_min,
region_size_max, verbose):
''' calculate nuc frequencies for normalization.
Returns: dict of nucleotide frequencies.
'''
nuc_counts = defaultdict(Counter)
fasta = Fasta(fasta_filename)
for chrom, seq in fasta.items():
for idx, pos in enumerate(seq):
for region_size in range(region_size_min,
region_size_max + 1):
nucs = seq[idx:idx+region_size]
if len(nucs) < region_size: continue
nuc_counts[region_size][nucs] += 1
return nuc_counts
开发者ID:speach,项目名称:modmap,代码行数:25,代码来源:genome_nuc_freqs.py
示例4: aa_seq
def aa_seq(options):
""" Gets the ancestral sequence from a Fasta file
"""
f = Fasta(options.ancestralfasta)
keyz = (f.keys())
match = ''
if (options.single_chromosome):
# Single chromosome fasta should only have one sequence.
# that sequence should be the sequence of interest.
keyz = list(keyz)
key = keyz[0]
else:
get_chromosome_from_header = options.header
get_chromosome_from_header = \
get_chromosome_from_header.replace('?', options.chromosome)
for key in keyz:
if(re.match(get_chromosome_from_header, key) is not None):
match = key
if(match is ''):
raise Exception("No match possible is something wrong with the"
" regex specified to the program as"
"--header-regex")
aaSeq = f[key]
return(aaSeq)
开发者ID:MMesbahU,项目名称:selectionTools,代码行数:25,代码来源:aa_annotate.py
示例5: _no_empty
def _no_empty(self, lista, listb):
''' removes empty entries '''
# check for empty fasta.
tmpa = list()
tmpb = list()
for i in range(len(listb)):
# open it.
try:
z = Fasta(listb[i], record_class=MemoryRecord)
# check for empty.
if len(z.keys()) == 0:
continue
# add to temp.
tmpa.append(lista[i])
tmpb.append(listb[i])
except:
logging.warning("bad fasta file")
# sort back.
return tmpa, tmpb
开发者ID:jim-bo,项目名称:parabio,代码行数:25,代码来源:nucmer.py
示例6: create_fasta_flat_file
def create_fasta_flat_file(file):
"""Reads a fasta file for fast sequence retrival"""
fasta_file = Fasta(file, key_fn=lambda key: key.split()[0])
fasta_headers = set(fasta_file.keys());
return fasta_file, fasta_headers
开发者ID:henrikstranneheim,项目名称:VariantUtilities,代码行数:8,代码来源:splicer.py
示例7: genome_contenct_stats
def genome_contenct_stats(fasta_path):
f = Fasta(fasta_path)
g_box_total = []
for seqid in f.keys():
seq = f[seqid][:]
g_boxs = len(re.findall("CACGTG", seq, flags=re.IGNORECASE))
g_box_total.append(g_boxs)
print >> sys.stderr, "total gboxes:{0}".format(sum(g_box_total))
开发者ID:gturco,项目名称:random_bio_tools,代码行数:8,代码来源:genome_stats.py
示例8: check_keyfn2
def check_keyfn2(path, klass, inplace):
f = Fasta(path, record_class=klass, flatten_inplace=inplace, key_fn=lambda
key: "-".join(key.split()))
assert sorted(f.keys()) == ['a-extra', 'b-extra', 'c-extra'], f.keys()
assert f['a-extra']
fix(path)
开发者ID:brentp,项目名称:pyfasta,代码行数:8,代码来源:test_all.py
示例9: read_fa
def read_fa(fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'):
gj.printFuncRun('read_fa')
gj.printFuncArgs()
fa_dict1 = Fasta(fa, key_fn=lambda key:key.split("\t")[0])
fa_dict = {i.split()[0]:j[0:] for i,j in fa_dict1.items()}
print fa_dict.keys()[0:3]
gj.printFuncRun('read_fa')
return fa_dict
开发者ID:Tsinghua-gongjing,项目名称:test,代码行数:8,代码来源:atcg_stats.py
示例10: split
def split(args):
parser = optparse.OptionParser("""\
split a fasta file into separated files.
pyfasta split -n 6 [-k 5000 ] some.fasta
the output will be some.0.fasta, some.1.fasta ... some.6.fasta
the sizes will be as even as reasonable.
""")
parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
help="""this overrides all other options. if specified, it will
split the file into a separate file for each header. it
will be a template specifying the file name for each new file.
e.g.: "%(fasta)s.%(seqid)s.fasta"
where 'fasta' is the basename of the input fasta file and seqid
is the header of each entry in the fasta file.""" ,default=None)
parser.add_option("-n", "--n", type="int", dest="nsplits",
help="number of new files to create")
parser.add_option("-o", "--overlap", type="int", dest="overlap",
help="overlap in basepairs", default=0)
parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
help="""\
split big files into pieces of this size in basepairs. default
default of -1 means do not split the sequence up into k-mers, just
split based on the headers. a reasonable value would be 10Kbp""")
options, fasta = parser.parse_args(args)
if not (fasta and (options.nsplits or options.header)):
sys.exit(parser.print_help())
if isinstance(fasta, (tuple, list)):
assert len(fasta) == 1, fasta
fasta = fasta[0]
kmer = options.kmers if options.kmers != -1 else None
overlap = options.overlap if options.overlap != 0 else None
f = Fasta(fasta)
if options.header:
names = dict([(seqid, options.header % \
dict(fasta=f.fasta_name, seqid=seqid)) \
for seqid in f.iterkeys()])
"""
if len(names) > 0:
assert names[0][1] != names[1][1], ("problem with header format", options.header)
fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
"""
return with_header_names(f, names)
else:
names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap,
header=options.header)
#fhs = [open(n, 'wb') for n in names]
if options.kmers == -1:
return without_kmers(f, names)
else:
return with_kmers(f, names, options.kmers, options.overlap)
开发者ID:brentp,项目名称:pyfasta,代码行数:55,代码来源:split_fasta.py
示例11: mask_to_bed
def mask_to_bed(fasta_file, mask_bed_name):
"creates a bed file of the start and stops of masked seqs"
mask_bed = open(mask_bed_name,"wb")
f= Fasta(fasta_file)
mask_id = 1
for seqid in f.keys():
seq = f[seqid][:]
for m in re.finditer("X+",seq):
mask_id = mask_id + 1
w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(seqid,m.start(),m.end(),"mask_id {0}".format(mask_id),(m.end()-m.start()),(m.end()-m.start()+1))
mask_bed.write(w)
开发者ID:gturco,项目名称:random_bio_tools,代码行数:11,代码来源:cns_to_bed.py
示例12: write_c2t
def write_c2t(fasta_name, unconverted, colorspace=False):
"""
given a fasta file, write a new file:
`some.fr.c2t.fasta` which contains:
+ the same headers prefixed with 'f' with all C's converted to T
+ headers prefixed with 'r' reverse complemented with
all C's converted to T.
if unconverted is false, then also save a file with the forward and reverse
without conversion.
"""
d = op.join(op.dirname(fasta_name), "bowtie_index")
if colorspace: d += "_colorspace"
if not op.exists(d): os.mkdir(d)
p, ext = op.splitext(op.basename(fasta_name)) # some.fasta -> some, fasta
fname = "%s/%s.fr.c2t%s" % (d, p, ext)
# no conversion, just copy the file into the index dir.
unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
if op.exists(fname):
if not unconverted: return fname, unconverted_fname
elif op.exists(unconverted_fname): return fname, unconverted_fname
fasta = Fasta(fasta_name)
c2t_fh = open(fname, 'w')
unc_fh = open(unconverted_fname, 'w') if unconverted else None
print >>sys.stderr, "writing forward and reverse c2t to: %s" % (fname,)
try:
for header in fasta.iterkeys():
seq = str(fasta[header]).upper()
assert not ">" in seq
# c2t, prefix header with f and write
print >>c2t_fh, ">f%s" % header
print >>c2t_fh, seq.replace('C', 'T')
# then r-c, c2t, prefix header with r and write
print >>c2t_fh, ">r%s" % header
rseq = revcomp(seq)
print >>c2t_fh, rseq.replace('C', 'T')
if unc_fh is not None:
print >>unc_fh, ">f%s\n%s" % (header, seq)
print >>unc_fh, ">r%s\n%s" % (header, rseq)
c2t_fh.close()
except:
os.unlink(fname)
os.unlink(unconverted_fname)
raise
return fname, unconverted_fname
开发者ID:BioinformaticsArchive,项目名称:methylcode,代码行数:52,代码来源:__init__.py
示例13: mask
def mask(fasta_file, org, cutoff, mask_value='X'):
h5, node = get_node(org, 'r')
outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
+ fasta_file[fasta_file.rfind("."):]
print "> masking sequence to file:", outfile
out = open(outfile ,'w')
fasta = Fasta(fasta_file)
soft_mask = mask_value.lower() == 'soft'
for seqid in sorted(fasta.iterkeys()):
masked = 0
if soft_mask:
seq = str(fasta[seqid])
# mask is the lowercase sequence.
mask_value = np.array(seq.lower(), dtype='c')
seq = np.array(seq.upper(), dtype='c')
else:
fasta[seqid].tostring = False
seq = fasta[seqid][:] # a
if not 'c' + seqid in node:
print >>sys.stderr, seqid,\
'! not found in masked, writing unchanged\n' \
' this means that no section of this sequence appeared\n' \
' more than %i times' % cutoff
out.write('>' + seqid + '\n')
out.write(seq.tostring() + '\n')
continue
hit_counts = getattr(node, 'c' + seqid)[:]
masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff)
, mask_value, seq).tostring()
l = len(masked_seq)
print >>sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (seqid, l,
100.0 * masked_seq.count(mask_value) / l)
assert len(seq) == l
out.write('>' + seqid + '\n')
out.write(masked_seq + '\n')
out.close()
# write out a file .fasta.version containing
# the svnversion (if available of this script
# that was used to create the file.
path = os.path.dirname(__file__)
os.system('svnversion %s > %s.version' % (path, outfile))
h5.close()
开发者ID:gturco,项目名称:find_cns,代码行数:51,代码来源:mask_genome.py
示例14: main
def main(gff_file, outdir):
"""empty docstring"""
name = re.compile("parent=([^.;]+)", re.I)
feats = {}
non_cds_feats = collections.defaultdict(list)
for line in open(gff_file):
line = line.split("\t")
match = re.search(name, line[-1])
if not match:
continue
fname = match.groups(0)[0]
non_cds_feats[fname].append(line)
if line[2].upper() == "CDS":
feats[fname] = True
continue
if fname in feats:
continue
feats[fname] = None
i = 0
for k, v in sorted(feats.items()):
if not v is None:
del non_cds_feats[k]
seen = {}
RNA = open(outdir + "/at_non_cds.gff", "w")
for k, feat_list in sorted(non_cds_feats.items()):
for feat in feat_list:
if feat[0] in ("ChrC", "ChrM"):
continue
if feat[2] == "exon":
continue
key = (feat[0], feat[3], feat[4])
if key in seen:
continue
feat[0] = feat[0].upper().replace("CHR", "")
seen[key] = True
feat[-1] = k
print >> RNA, "\t".join(feat)
RNA.close()
gff = read_gff(outdir + "/at_non_cds.gff")
fasta = Fasta("/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta")
ftypes = {}
FA = open(outdir + "/at_rnas.fasta", "w")
for chr, feature_list in gff.iteritems():
for fname, feature in feature_list.iteritems():
seq = fasta.sequence(feature)
print >> FA, ">", feature["name"]
print >> FA, seq
FA.close()
开发者ID:gturco,项目名称:find_cns,代码行数:51,代码来源:arabidopsis_rna.py
示例15: check_kmer_overlap
def check_kmer_overlap(f):
chr2 = f['chr2']
kmers = Fasta.as_kmers(chr2, 10, overlap=2)
for i, k in enumerate(list(kmers)[:-1]):
assert (len(k[1]) == 10)
assert (k[0] == (i * (10 - 2)))
kmers = Fasta.as_kmers(chr2, 10, overlap=4)
seqs = [k[1] for k in kmers]
paired_seqs = zip(seqs[0:-1], seqs[1:])
for a, b in paired_seqs:
if len(a) < 4 or len(b) < 4: continue
assert (a[-4:] == b[:4])
开发者ID:jamescasbon,项目名称:pyfasta,代码行数:14,代码来源:test_all.py
示例16: segments
def segments(self):
'''
Generator for Segments
'''
startchr = self.start_chromosome
start = self.start_location
chrs = [x[0] for x in sorted(self.fasta.index.items(), key=lambda a: a[1][0])]
for chr in chrs:
segcount = 0
if self.verbose:
print "Reading chr %s" % chr
# Skip forward if a starting chr was defined
if startchr is not None and startchr != chr:
continue
else:
startchr = None
for kmer in Fasta.as_kmers(self.fasta[chr],self.segment_size):
end = start + self.segment_size
seg = Segment(start, end, kmer[1] ,chr)
segcount += 1
if self.verbose and segcount % 1000 == 0:
print "Read %d segments" % segcount
yield seg
start = end
开发者ID:harvardinformatics,项目名称:gx,代码行数:25,代码来源:PyfastaReader.py
示例17: create_pyfasta_iterator
def create_pyfasta_iterator(self, **kwargs):
from pyfasta import Fasta
print "Generating PyFasta sequence index. This may take a moment...."
self.fasta = Fasta(kwargs['input'])
self.readcount = len(self.fasta)
self.db_values = zip(range(len(self.fasta)), sorted(self.fasta.keys()))
self.read = iter(self.db_values)
开发者ID:faircloth-lab,项目名称:msatcommander-gs,代码行数:7,代码来源:design_primers.py
示例18: check_kmers
def check_kmers(f):
seq = str(f['chr2'])
kmers = list(Fasta.as_kmers(f['chr2'], 10))
assert (len(kmers) == len(seq) / 10)
assert (kmers[0] == (0, seq[:10]))
seqs = [k[1] for k in kmers]
assert ("".join(seqs) == seq)
last_pair = kmers[-1]
assert (seqs[-1][-1] == 'T')
seq = str(f['chr3'])
kmers = list(Fasta.as_kmers(f['chr3'], 1))
assert (kmers[2][0] == 2)
seqs = [k[1] for k in kmers]
assert ("".join(seqs) == seq)
开发者ID:jamescasbon,项目名称:pyfasta,代码行数:17,代码来源:test_all.py
示例19: Reference
class Reference(object):
def __init__(self, genome_fasta):
# @see: https://pypi.python.org/pypi/pyfasta
key_fn = lambda key : key.split()[0] # Use first value before whitespace as keys
self.fasta = Fasta(genome_fasta, key_fn=key_fn)
def get_sequence_from_iv(self, iv):
feature_hash = {'chr' : iv.chrom, 'start' : iv.start, 'stop' : iv.end, 'strand' : iv.strand}
return self.fasta.sequence(feature_hash, one_based=False)
开发者ID:henmt,项目名称:2015,代码行数:9,代码来源:reference.py
示例20: generate_corpusfile
def generate_corpusfile(fasta_fname, n, corpus_fname):
'''
Args:
fasta_fname: corpus file name
n: the number of chunks to split. In other words, "n" for "n-gram"
corpus_fname: corpus_fnameput corpus file path
Description:
Protvec uses word2vec inside, and it requires to load corpus file
to generate corpus.
'''
f = open(corpus_fname, "w")
fasta = Fasta(fasta_fname)
for record_id in tqdm(fasta.keys(), desc='corpus generation progress'):
r = fasta[record_id]
seq = str(r)
ngram_patterns = split_ngrams(seq, n)
for ngram_pattern in ngram_patterns:
f.write(" ".join(ngram_pattern) + "\n")
f.close()
开发者ID:kyu999,项目名称:biovec,代码行数:19,代码来源:models.py
注:本文中的pyfasta.Fasta类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论