This article collects typical usage examples of the Python utils.grouper function. If you have been wondering how exactly to use grouper, or what it is good for, the curated code samples below may help.
The following shows 20 code examples of the grouper function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
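Note that these examples come from unrelated projects, so the exact signature of grouper varies between them: some call it as grouper(n, iterable), others as grouper(iterable, chunksize) or with a fillvalue keyword. As a rough mental model (an illustrative sketch, not the definition used by any of these projects), most variants follow the standard itertools "grouper" recipe built on zip_longest:

    from itertools import zip_longest  # izip_longest on Python 2

    def grouper(iterable, n, fillvalue=None):
        """Collect data into fixed-length chunks: grouper('ABCDEFG', 3, 'x') -> ABC DEF Gxx."""
        args = [iter(iterable)] * n
        return zip_longest(*args, fillvalue=fillvalue)

    # The last chunk is padded with fillvalue, which is why several examples below
    # filter out None entries (e.g. `if line is not None`) before processing a chunk.
    for chunk in grouper(range(7), 3):
        print(chunk)  # (0, 1, 2) / (3, 4, 5) / (6, None, None)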
Example 1: compute_descriptors
def compute_descriptors(infile, descriptor_types):
    """Reads low-level descriptors from DenseTracks."""
    LEN_LINE = 436
    POS_IDXS = [1, 2, 0]       # Position coordinates (X, Y, T).
    NORM_POS_IDXS = [7, 8, 9]  # Normalized position coordinates (X, Y, T).

    dense_tracks = subprocess.Popen(
        [DENSE_TRACK, infile],
        stdout=subprocess.PIPE)

    for lines in grouper(dense_tracks.stdout, NR_DESCRIPTORS):
        all_descs = np.vstack([
            map(float, line.split())
            for line in lines
            if line is not None]
        ).astype(np.float32)

        assert all_descs.shape[0] <= NR_DESCRIPTORS
        assert all_descs.shape[1] == LEN_LINE

        positions = all_descs[:, POS_IDXS]
        normalized_positions = all_descs[:, NORM_POS_IDXS]
        descriptors = {
            desc_type: all_descs[:, DESC_IDXS[desc_type]]
            for desc_type in descriptor_types}

        yield positions, normalized_positions, descriptors
Author: martin-xavier, Project: medpackage, Lines: 29, Source: densetrack_to_fisher_shot_errorprotect.py
Example 2: main
def main():
    logger = configure_logging('parse_serverstatus')
    client = InfluxDBClient(host=args.influxdb_host, ssl=args.ssl, verify_ssl=False, port=8086, database=args.database)
    with open(args.input_file, 'r') as f:
        for line_number, chunk in enumerate(grouper(f, args.batch_size)):
            # print(line_number)
            json_points = []
            for line in chunk:
                # zip_longest will backfill any missing values with None, so we need to handle this, otherwise we'll miss the last batch
                if line:
                    try:
                        server_status_json = json.loads(line)
                        # print((line_number + 0) * _BATCH_SIZE)
                        # print((line_number + 1) * _BATCH_SIZE)
                        common_metric_data = get_metrics("serverstatus", server_status_json, common_metrics, line_number)
                        json_points.append(create_point(*common_metric_data))
                        wiredtiger_metric_data = get_metrics("serverstatus_wiredtiger", server_status_json, wiredtiger_metrics, line_number)
                        json_points.append(create_point(*wiredtiger_metric_data))
                        # for metric_data in get_metrics(server_status_json, common_metrics, line_number):
                        #     import ipdb; ipdb.set_trace()
                        #     print(json_points)
                        #     json_points.append(create_point(*metric_data))
                        # # for metric in get_metrics(server_status_json, wiredtiger_metrics, line_number):
                        #     json_points.append(create_point(*metric))
                        # for metric in get_metrics(server_status_json, mmapv1_metrics, line_number):
                        #     json_points.append(create_point(*metric))
                    except ValueError:
                        logger.error("Line {} does not appear to be valid JSON - \"{}\"".format(line_number, line.strip()))
            write_points(logger, client, json_points, line_number)
Author: jimoleary, Project: mongo-insight, Lines: 29, Source: parse_serverstatus.py
Example 3: main
def main(args):
    global DEBUG
    if len(args) == 1:
        # no args - repl
        while True:
            print 'que?>',
            try:
                print google_it(raw_input())
            except EOFError:
                break
            except:
                import traceback
                traceback.print_exc()
    else:
        # test mode
        DEBUG = False
        print 'Loading testfile...'
        tests = filter(bool, open(args[1]).read().split('\n'))
        print len(tests), 'tests'
        for clue, answer in utils.grouper(2, tests):
            clue = clue.split('~!clue')[1]
            answer = answer.split("~!answer")[1]
            try:
                print '----------------------------------------------------------------'
                print 'clue:', clue
                print 'correct:', answer
                print 'eubank:', google_it(clue)
            except KeyboardInterrupt:
                sys.exit(0)
            except:
                import traceback
                traceback.print_exc()
Author: andrewgjohnson-forks, Project: Eubank, Lines: 33, Source: jeopardy.py
Example 4: main
def main():
    description = 'Split a FASTA file into multiple subfiles.'
    parser = ArgumentParser(description=description,
                            parents=[get_default_argument_parser()])
    parser.add_argument('-f', '--in-format',
                        default=_DEFAULT_FMT,
                        help="A biopython file format string.")
    parser.add_argument('-n', '--num-files', type=int,
                        default=_DEFAULT_N,
                        help=("The number of splits. "
                              "DEFAULT=%d") % _DEFAULT_N)
    parser.add_argument('in_path', nargs='?', default=None,
                        help=("The path of the file to be read in. "
                              "If no argument given, reads from STDIN."))
    parser.add_argument('out_pattern', default=None,
                        help=("Output file names format string. "
                              "Must contain one '%%d' for the file number."))
    args = parser.parse_args()

    if args.in_path is None:
        record_parser = SeqIO.parse(sys.stdin, args.in_format)
    else:
        record_parser = SeqIO.parse(args.in_path, args.in_format)

    write_multithread(grouper(record_parser, 100),
                      lambda recs, handle:
                          SeqIO.write(recs, handle, args.in_format),
                      args.out_pattern, n=args.num_files)
Author: bsmith89, Project: rrnum, Lines: 28, Source: split_seqs.py
Example 5: train
def train(self, sentences, total_words=None, word_count=0, chunksize=100):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of utf8 strings.
    """
    logger.info("training model on %i vocabulary and %i features" % (len(self.vocab), self.layer1_size))
    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    start, next_report = time.time(), 1.0
    if not total_words:
        total_words = sum(v.count for v in self.vocab.itervalues())
    # convert input string lists to Vocab objects (or None for OOV words)
    no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences)
    # run in chunks of e.g. 100 sentences (= 1 job)
    for job in utils.grouper(no_oov, chunksize):
        # update the learning rate before every job
        alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count / total_words))
        # how many words did we train on? out-of-vocabulary (unknown) words do not count
        job_words = sum(train_sentences(self, sentence, alpha) for sentence in job)
        word_count += job_words
        # report progress
        elapsed = time.time() - start
        if elapsed >= next_report:
            logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                        (100.0 * word_count / total_words, alpha, word_count / elapsed if elapsed else 0.0))
            next_report = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    elapsed = time.time() - start
    logger.info("training on %i words took %.1fs, %.0f words/s" %
                (word_count, elapsed, word_count / elapsed if elapsed else 0.0))
    return word_count
Author: nudles, Project: word2vec, Lines: 32, Source: word2vec.py
Example 6: __iter__
def __iter__(self):
    if self.chunksize:
        for chunk in utils.grouper(self.corpus, self.chunksize):
            for transformed in self.obj.__getitem__(chunk, chunksize=None):
                yield transformed
    else:
        for doc in self.corpus:
            yield self.obj[doc]
Author: Anikacyp, Project: gensim, Lines: 8, Source: interfaces.py
Example 7: train
def train(self, sentences, total_words=None, word_count=0, chunksize=100):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of utf8 strings.
    """
    logger.info("training model with %i workers on %i vocabulary and %i features" % (self.workers, len(self.vocab), self.layer1_size))
    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    word_count, total_words = [word_count], total_words or sum(v.count for v in self.vocab.itervalues())
    jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

    def worker_train():
        """Train the model, lifting lists of sentences from the jobs queue."""
        work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # each thread must have its own work memory

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break
            # update the learning rate before every job
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
            # how many words did we train on? out-of-vocabulary (unknown) words do not count
            job_words = sum(train_sentence(self, sentence, alpha, work) for sentence in job)
            with lock:
                word_count[0] += job_words
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                                (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue
    no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences)
    for job_no, job in enumerate(utils.grouper(no_oov, chunksize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in xrange(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i words took %.1fs, %.0f words/s" %
                (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))
    return word_count[0]
Author: MorLong, Project: word2vec-1, Lines: 58, Source: word2vec.py
Example 8: import_json
def import_json():
    for g in grouper(1000, sys.stdin):
        try:
            Model.database.bulk_save([json.loads(l) for l in g if l])
        except BulkSaveError as err:
            if any(d['error'] != 'conflict' for d in err.errors):
                raise
            else:
                logging.warn("conflicts for %r", [d['id'] for d in err.errors])
Author: JeffAMcGee, Project: localcrawl, Lines: 9, Source: admin.py
Example 9: read_slr
def read_slr(fh):
    stats = fh.readline()
    seqs = []
    for l in utils.grouper(fh, 2):
        name = l[0].rstrip()
        seq = l[1].rstrip()
        seqs.append(SeqRecord(id=name, seq=Seq(seq), description=""))
    return seqs
Author: pomeranz, Project: tree_stats, Lines: 10, Source: prepare_fubar.py
Example 10: __init__
def __init__(self, horn_pointing=False, siamfile=None):
    self.horn_pointing = horn_pointing
    if siamfile is None:
        siamfile = private.siam
    f = open(siamfile)
    lines = f.readlines()
    self.siam = {}
    for line in grouper(4, lines[1:]):
        chtag = line[0].split()[0]
        m = np.array(np.matrix(';'.join(line[1:])))
        self.siam[chtag] = m
Author: tskisner, Project: planck, Lines: 11, Source: pointingtools.py
Example 11: import_old_json
def import_old_json():
    for g in grouper(1000, sys.stdin):
        docs = [json.loads(l) for l in g if l]
        for d in docs:
            del d['doc_type']
            for k, v in d.iteritems():
                if k[-2:] == 'id' or k in ('rtt', 'rtu'):
                    d[k] = v[1:]
            for field in ['ats', 'fols', 'frs']:
                if field in d and isinstance(d[field], list):
                    d[field] = [u[1:] for u in d[field]]
        Model.database.bulk_save(docs)
Author: JeffAMcGee, Project: localcrawl, Lines: 12, Source: admin.py
Example 12: xfory
def xfory(price_info, units):
    """ function to discount per groups. if you pay Y you get X """
    total = 0
    x = price_info.get('x')
    y = price_info.get('y')
    price = price_info.get('unitPrice')
    for group in grouper(x, range(0, units)):
        has_discount = len(group) == x
        per_unit = price if not has_discount else y / x * price
        total = total + (per_unit * len(group))
    return total / units
Author: abelgvidal, Project: exercises, Lines: 13, Source: rules.py
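For intuition, here is a quick worked check of the rule above. The concrete price_info values are hypothetical, and it assumes Python 3 division plus a grouper(n, iterable) helper that does not pad the final group (so len(group) can be smaller than x):

    # Hypothetical "3 for the price of 2" promotion at a unit price of 10.
    price_info = {'x': 3, 'y': 2, 'unitPrice': 10}

    # With 7 units, grouper(3, range(0, 7)) yields two full groups of 3 and one leftover unit.
    # Full groups are billed at y / x * price = 2/3 * 10 per unit (20 per group of 3),
    # the leftover unit at the full price of 10:
    #   total   = 20 + 20 + 10 = 50
    #   average = 50 / 7 ~ 7.14 per unit
    average = xfory(price_info, 7)  # ~ 7.14, using the xfory defined above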
Example 13: command_service
def command_service(self, rawCommand):
    """
    Parse raw input and execute specified function with args
    :param rawCommand: csv string from Matlab/Simulink of the form:
        'command, namedArg1, arg1, namedArg2, arg2, ..., namedArgN, argN'
    :return: the command and arguments as a dictionary
    """
    pack = [x.strip() for x in split('[,()]*', rawCommand.strip())]
    raw_cmd = pack[0]
    argDict = {key: literal_eval(value) for key, value in utils.grouper(pack[1:], 2)}
    cmd = self.mapInterface.commands[raw_cmd]
    ret = cmd(**argDict)
    logger.info("Command '{}' run with args {}".format(raw_cmd, argDict))
    return raw_cmd, ret
Author: friend0, Project: world_engine, Lines: 15, Source: server.py
Example 14: train
def train(self, triples, total_triples=None, triples_count=0, chunksize=1000):
    if not self.vocab or not self.vocab_rel:
        raise RuntimeError("you must first build entity and relation vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    triples_count = [triples_count]
    total_triples = total_triples or int(sum(1 for v in triples))
    jobs = Queue(maxsize=2 * self.workers)
    lock = threading.Lock()

    def worker_train():
        work = zeros(self.layer1_size, dtype=REAL)
        detR = zeros((self.layer1_size, self.layer1_size), dtype=REAL)
        # neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
        while True:
            job = jobs.get()
            if job is None:
                break
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * triples_count[0] / total_triples))
            job_triples = self._get_job_triples(alpha, job, work, detR)
            with lock:
                triples_count[0] += job_triples
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    logger.info("PROGRESS: at %.2f%% triples, alpha %.05f, %.0f triples/s" %
                                (100.0 * triples_count[0] / total_triples, alpha, triples_count[0] / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0

    workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
    for job_no, job in enumerate(utils.grouper(self._prepare_triples(triples), chunksize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in xrange(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i triples took %.1fs, %.0f triples/s" %
                (triples_count[0], elapsed, triples_count[0] / elapsed if elapsed else 0.0))
    self.syn0norm = None
    return triples_count[0]
Author: v-shinc, Project: KB2Vec, Lines: 48, Source: kb2vec.py
Example 15: fetch_edges
def fetch_edges():
    Edges.database = connect("houtx_edges")
    User.database = connect("away_user")
    old_edges = set(int(row['id']) for row in Edges.database.paged_view("_all_docs", endkey="_"))
    uids = set(_users_from_scores()) - old_edges
    settings.pdb()
    for g in grouper(100, uids):
        for user in twitter.user_lookup(g):
            if user is None or user.protected:
                continue
            try:
                edges = twitter.get_edges(user._id)
            except restkit.errors.Unauthorized:
                logging.warn("unauthorized!")
                continue
            except restkit.errors.ResourceNotFound:
                logging.warn("resource not found!?")
                continue
            edges.save()
            user.save()
            sleep_if_needed()
Author: JeffAMcGee, Project: localcrawl, Lines: 20, Source: admin.py
Example 16: compute_descriptors
def compute_descriptors(infile, descriptor_type):
    """Reads low-level descriptors from DenseTracks."""
    LEN_LINE = 436
    POS_IDXS = [1, 2, 0]  # Positional coordinates (X, Y, T).

    dense_tracks = subprocess.Popen(
        ['./DenseTrack', infile], stdout=subprocess.PIPE)
    descriptor_idxs = DESC_IDXS[descriptor_type]

    for lines in grouper(dense_tracks.stdout, NR_DESCRIPTORS):
        all_descs = np.vstack([
            map(float, line.split())
            for line in lines
            if line is not None]
        ).astype(np.float32)

        assert all_descs.shape[0] <= NR_DESCRIPTORS
        assert all_descs.shape[1] == LEN_LINE

        yield all_descs[:, POS_IDXS], all_descs[:, descriptor_idxs]
Author: martin-xavier, Project: medpackage, Lines: 22, Source: fisher_vector.py
Example 17: main
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-model', type=str, required=True)
    parser.add_argument('-weights', type=str, required=True)
    parser.add_argument('-results', type=str, required=True)
    args = parser.parse_args()

    model = model_from_json(open(args.model).read())
    model.load_weights(args.weights)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    questions_val = open('../data/preprocessed/questions_val2014.txt',
                         'r').read().decode('utf8').splitlines()
    answers_val = open('../data/preprocessed/answers_val2014.txt',
                       'r').read().decode('utf8').splitlines()
    images_val = open('../data/preprocessed/images_val2014.txt',
                      'r').read().decode('utf8').splitlines()
    vgg_model_path = '../features/coco/vgg_feats.mat'

    print 'Model compiled, weights loaded...'
    labelencoder = joblib.load('../models/labelencoder.pkl')

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print 'loaded vgg features'
    image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines()
    img_map = {}
    for ids in image_ids:
        id_split = ids.split()
        img_map[id_split[0]] = int(id_split[1])

    nlp = English()
    print 'loaded word2vec features'

    nb_classes = 1000
    y_predict_text = []
    batchSize = 128
    widgets = ['Evaluating ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
               ' ', ETA()]
    pbar = ProgressBar(widgets=widgets)

    for qu_batch, an_batch, im_batch in pbar(zip(grouper(questions_val, batchSize, fillvalue=questions_val[0]),
                                                 grouper(answers_val, batchSize, fillvalue=answers_val[0]),
                                                 grouper(images_val, batchSize, fillvalue=images_val[0]))):
        X_q_batch = get_questions_matrix_sum(qu_batch, nlp)
        if 'language_only' in args.model:
            X_batch = X_q_batch
        else:
            X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
            X_batch = np.hstack((X_q_batch, X_i_batch))
        y_predict = model.predict_classes(X_batch, verbose=0)
        y_predict_text.extend(labelencoder.inverse_transform(y_predict))

    correct_val = 0
    incorrect_val = 0
    f1 = open(args.results, 'w')

    for prediction, truth, question, image in zip(y_predict_text, answers_val, questions_val, images_val):
        temp_count = 0
        for _truth in truth.split(';'):
            if prediction == _truth:
                temp_count += 1
        if temp_count > 2:
            correct_val += 1
        else:
            incorrect_val += 1
        f1.write(question.encode('utf-8'))
        f1.write('\n')
        f1.write(image.encode('utf-8'))
        f1.write('\n')
        f1.write(prediction)
        f1.write('\n')
        f1.write(truth.encode('utf-8'))
        f1.write('\n')
        f1.write('\n')

    f1.write('Final Accuracy is ' + str(float(correct_val) / (incorrect_val + correct_val)))
    f1.close()
    f1 = open('../results/overall_results.txt', 'a')
    f1.write(args.weights + '\n')
    f1.write(str(float(correct_val) / (incorrect_val + correct_val)) + '\n')
    f1.close()
    print 'Final Accuracy on the validation set is', float(correct_val) / (incorrect_val + correct_val)
Author: Goddard, Project: visual-qa, Lines: 85, Source: evaluateMLP.py
Example 18: main
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-num_hidden_units', type=int, default=512)
    parser.add_argument('-num_lstm_layers', type=int, default=2)
    parser.add_argument('-dropout', type=float, default=0.2)
    parser.add_argument('-activation', type=str, default='tanh')
    parser.add_argument('-num_epochs', type=int, default=100)
    parser.add_argument('-model_save_interval', type=int, default=5)
    parser.add_argument('-batch_size', type=int, default=128)
    parser.add_argument('-word_vector', type=str, default='')
    args = parser.parse_args()

    questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines()
    questions_lengths_train = open('../data/preprocessed/questions_lengths_train2014.txt', 'r').read().decode('utf8').splitlines()
    answers_train = open('../data/preprocessed/answers_train2014.txt', 'r').read().decode('utf8').splitlines()
    images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines()
    max_answers = 1000
    questions_train, answers_train, images_train = selectFrequentAnswers(questions_train, answers_train, images_train, max_answers)

    print 'Loaded questions, sorting by length...'
    questions_lengths_train, questions_train, answers_train = (list(t) for t in zip(*sorted(zip(questions_lengths_train, questions_train, answers_train))))

    # encode the remaining answers
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train)
    nb_classes = len(list(labelencoder.classes_))
    joblib.dump(labelencoder, '../models/labelencoder.pkl')

    max_len = 30  # 25 is max for training, 27 is max for validation
    word_vec_dim = 300

    model = Sequential()
    model.add(LSTM(output_dim=args.num_hidden_units, activation='tanh',
                   return_sequences=True, input_shape=(max_len, word_vec_dim)))
    model.add(Dropout(args.dropout))
    model.add(LSTM(args.num_hidden_units, return_sequences=False))
    model.add(Dense(nb_classes, init='uniform'))
    model.add(Activation('softmax'))

    json_string = model.to_json()
    model_file_name = '../models/lstm_language_only_num_hidden_units_' + str(args.num_hidden_units) + '_num_lstm_layers_' + str(args.num_lstm_layers) + '_dropout_' + str(args.dropout)
    open(model_file_name + '.json', 'w').write(json_string)

    print 'Compiling model...'
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print 'Compilation done...'

    # set up word vectors
    # Code to choose the word vectors, default is Goldberg but GLOVE is preferred
    if args.word_vector == 'glove':
        nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    else:
        nlp = English()
    print 'loaded ' + args.word_vector + ' word2vec features...'

    ## training
    # Moved few variables to args.parser (num_epochs, batch_size, model_save_interval)
    print 'Training started...'
    for k in xrange(args.num_epochs):
        progbar = generic_utils.Progbar(len(questions_train))
        for qu_batch, an_batch, im_batch in zip(grouper(questions_train, args.batch_size, fillvalue=questions_train[0]),
                                                grouper(answers_train, args.batch_size, fillvalue=answers_train[0]),
                                                grouper(images_train, args.batch_size, fillvalue=images_train[0])):
            timesteps = len(nlp(qu_batch[-1]))  # questions sorted in descending order of length
            X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
            Y_batch = get_answers_matrix(an_batch, labelencoder)
            loss = model.train_on_batch(X_q_batch, Y_batch)
            # fix for the Keras v0.3 issue #9
            progbar.add(args.batch_size, values=[("train loss", loss[0])])
        if k % args.model_save_interval == 0:
            model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k))

    model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k + 1))
Author: iamaaditya, Project: visual-qa, Lines: 77, Source: trainLSTM_language.py
Example 19: BeautifulSoup
learnset_url += "{0}_(Pok%C3%A9mon)/Generation_I_learnset".format(pname)
html_file = urllib2.urlopen(learnset_url)
learnset_html = html_file.read()
html_file.close()
bs = BeautifulSoup(learnset_html)
x = [td.text for td in bs.findAll("td")
     if 0 < len(td.text) < 60]
# if td.text in movename_to_num.values()] worked pretty well, but...
# Just grabbing everything that appears anywhere and is a valid move
# name will grab Psychic, when those characters only appeared to indicate
# the type of a move and not the Move Psychic
# So instead, group them into clumps... it seems to group very consistently
grouped = list(grouper(x, 6))

# Pikachu had a weird move: Light Screen, which he learns at Level 50 in
# Pokemon Yellow, but never in Red/Blue. So just grabbing the values in the
# table that are valid moves would actually lead us to believe that Pikachu
# can learn Light Screen, which he can, but it doesn't have a TM until Gen
# 3. Instead, let's group the entries, drop the ones from Pokemon Yellow,
# and then grab the remaining moves
not_yellow = [entry for entry in grouped if not entry[0].endswith("Y")]

# fix a problem that Vaporean == 106 was having
valid_starts = ("T", "H", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
valid = [entry for entry in not_yellow
         if entry[0].startswith(valid_starts)]
moves = [standardize(entry[1]) for entry in valid
Author: hoxiea, Project: pokemodel, Lines: 31, Source: get_learnsets.py
Example 20: train
def train(self, sentences, total_words=None, word_count=0, sent_count=0, chunksize=100):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of unicode strings.
    """
    logger.info("training model with %i workers on %i sentences and %i features, "
                "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s" %
                (self.workers, self.sents_len, self.layer1_size, self.sg, self.hs, self.sample, self.negative))
    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    word_count = [word_count]
    sent_count = [sent_count]
    total_words = total_words or sum(v.count * v.sample_probability for v in itervalues(self.vocab))
    total_sents = self.total_sents  # it's now different from self.sents_len
    jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

    def worker_train():
        """Train the model, lifting lists of sentences from the jobs queue."""
        work = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)  # each thread must have its own work memory
        neu1 = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break
            # update the learning rate before every job
            if self.update_mode == 0:
                alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
            else:
                alpha = self.alpha
            job_words = sum(train_sent_vec(self, self.sents[sent_no], sentence, alpha, work, neu1, self.sents_grad[sent_no])
                            for sent_no, sentence in job)
            with lock:
                word_count[0] += job_words
                sent_count[0] += chunksize
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    logger.info("PROGRESS: at %.2f%% sents, alpha %.05f, %.0f words/s" %
                                (100.0 * sent_count[0] / total_sents, alpha, word_count[0] / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    def prepare_sentences():
        for sent_tuple in sentences:
            sentence = sent_tuple[0]
            sent_id = sent_tuple[1]
            sent_no = self.sent_no_hash[sent_id]
            sampled = [self.vocab.get(word, None) for word in sentence
                       if word in self.vocab and (self.vocab[word].sample_probability >= 1.0 or self.vocab[word].sample_probability >= random.random_sample())]
            yield (sent_no, sampled)

    # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
    for job_no, job in enumerate(utils.grouper(prepare_sentences(), chunksize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in xrange(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i words took %.1fs, %.0f words/s" %
                (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))
    return word_count[0]
Author: nathan2718, Project: category2vec, Lines: 76, Source: sent2vec.py
Note: the utils.grouper function examples in this article were compiled from GitHub, MSDocs, and similar source-code and documentation platforms. The code snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors. For distribution and use, please refer to the corresponding project's license; do not reproduce without permission.