本文整理汇总了 Python 中 themis.logger.info 函数的典型用法代码示例。如果您正苦于以下问题：Python info 函数的具体用法？Python info 怎么用？Python info 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了 info 函数的 20 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的 Python 代码示例。
示例1: truth_coverage
def truth_coverage(corpus, truth, systems_data):
    """
    Statistics about which answers came from the truth set broken down by system.

    :param corpus: corpus generated by 'xmgr corpus' command
    :type corpus: pandas.DataFrame
    :param truth: question to answer mapping used in training
    :type truth: pandas.DataFrame
    :param systems_data: collated results for all systems, one frame per system (they are concatenated here)
    :type systems_data: list of pandas.DataFrame
    :return: truth coverage summary statistics, indexed by system and sorted by number of correct answers
    :rtype: pandas.DataFrame
    """
    # Distinct answer texts that are reachable from the truth set via the answer id.
    truth_answers = pandas.merge(corpus, truth, on=ANSWER_ID)[ANSWER].drop_duplicates()
    n = len(corpus)
    m = len(truth_answers)
    if n:
        logger.info("%d answers out of %d possible answers in truth (%0.3f%%)" % (m, n, 100.0 * m / n))
    else:
        # An empty corpus has no meaningful percentage; avoid dividing by zero.
        logger.info("%d answers out of %d possible answers in truth" % (m, n))
    systems_data = pandas.concat(systems_data).dropna()
    answers = systems_data.groupby(SYSTEM)[[CORRECT]].count()
    answers_in_truth = systems_data[systems_data[ANSWER].isin(truth_answers)].groupby(SYSTEM)[[ANSWER]]
    # The summary is indexed by the systems that returned at least one answer from the truth set.
    summary = answers_in_truth.count()
    summary["Answers"] = answers
    summary = summary.rename(columns={ANSWER: "Answers in Truth"})
    summary["Answers in Truth %"] = 100 * summary["Answers in Truth"] / summary["Answers"]
    correct_answers = systems_data[systems_data[CORRECT]]
    correct_answers_in_truth = correct_answers[correct_answers[ANSWER].isin(truth_answers)]
    summary["Correct Answers"] = correct_answers.groupby(SYSTEM)[CORRECT].count()
    summary["Correct Answers in Truth"] = correct_answers_in_truth.groupby(SYSTEM)[CORRECT].count()
    summary["Correct Answers in Truth %"] = 100 * summary["Correct Answers in Truth"] / summary["Correct Answers"]
    return summary[
        ["Answers", "Correct Answers",
         "Answers in Truth", "Answers in Truth %",
         "Correct Answers in Truth", "Correct Answers in Truth %"]].sort_values("Correct Answers", ascending=False)
开发者ID:louisroehrs,项目名称:themis,代码行数:33,代码来源:analyze.py
示例2: get_items
def get_items(item_type, names, checkpoint, get_item, write_frequency):
    """
    Fetch every named item that is not already in the checkpoint and record it there.

    Items recovered from a previous run are skipped. The checkpoint is always closed before returning, even if
    fetching an item fails.

    :param item_type: name of item type for use in logging
    :type item_type: str
    :param names: list of item names
    :type names: list
    :param checkpoint: checkpoint to periodically write items to
    :type checkpoint: DataFrameCheckpoint
    :param get_item: function that returns an item given a name
    :type get_item: func
    :param write_frequency: how often (in items) to log a progress message
    :type write_frequency: int
    :return: the checkpoint
    :rtype: DataFrameCheckpoint
    """
    already_done = checkpoint.recovered
    if already_done:
        logger.info("Recovered %d %s from previous run" % (len(already_done), item_type))
    last = len(names)
    # Progress numbering continues from where the previous run stopped.
    first = len(already_done) + 1
    try:
        pending = sorted(set(names) - already_done)
        for position, pending_name in enumerate(pending, first):
            # Report on the first item, the last item, and every write_frequency-th item in between.
            if position == first or position == last or position % write_frequency == 0:
                logger.info("Get " + percent_complete_message(item_type, position, last))
            fetched = get_item(pending_name)
            checkpoint.write(pending_name, fetched)
    finally:
        checkpoint.close()
    return checkpoint
开发者ID:DharmendraVaghela,项目名称:themis,代码行数:33,代码来源:checkpoint.py
示例3: kfold_split
def kfold_split(df, outdir, _folds=5, _training_header=False):
    """
    Shuffle a data frame and write k-fold training and testing sets to the output directory as CSV files.

    For fold number i the held-out rows are written to Test<i>.csv and the remaining rows to Train<i>.csv.

    :param df: data frame to be split
    :param outdir: output directory path
    :param _folds: number of folds to produce
    :param _training_header: value passed as the header argument when writing the training files
    :return: None; the folds are only written to disk
    """
    # Randomize the order of the input dataframe
    shuffled = df.iloc[np.random.permutation(len(df))].reset_index(drop=True)
    total = len(shuffled)
    fold_size = int(math.ceil(total / float(_folds)))
    logger.info("Total records: " + str(total))
    logger.info("Fold size: " + str(fold_size))
    logger.info("Results written to output folder " + outdir)
    for fold in range(_folds):
        lower = fold * fold_size
        # The final fold may be shorter than the others.
        upper = min(lower + fold_size, total)
        test_df = shuffled.iloc[lower:upper]
        train_df = shuffled.drop(shuffled.index[lower:upper])
        test_path = os.path.join(outdir, 'Test' + str(fold) + '.csv')
        train_path = os.path.join(outdir, 'Train' + str(fold) + '.csv')
        test_df.to_csv(test_path, encoding='utf-8', index=False)
        train_df.to_csv(train_path, header=_training_header, encoding='utf-8', index=False)
        logger.info("--- Train_Fold_" + str(fold) + ' size = ' + str(len(train_df)))
        logger.info("--- Test_Fold_" + str(fold) + ' size = ' + str(len(test_df)))
开发者ID:cognitive-catalyst,项目名称:themis,代码行数:35,代码来源:analyze.py
示例4: interpret_annotation_assist
def interpret_annotation_assist(annotation_assist, judgment_threshold):
    """
    Convert the file produced by the Annotation Assist tool into a set of judgments that can be used by Themis.

    Convert the in purview column from an integer value to a boolean. Convert the annotation score column to a boolean
    correct column by applying a threshold. An answer can only be correct if the question is in purview. Drop any Q&A
    pairs that have multiple annotations.

    Note that the input frame is modified in place (duplicates are dropped and columns are rewritten); the returned
    frame is a new object without the annotation score column.

    :param annotation_assist: Annotation Assist judgments
    :type annotation_assist: pandas.DataFrame
    :param judgment_threshold: score at or above which an answer is deemed correct
    :type judgment_threshold: number
    :return: Annotation Assist judgments with a boolean Correct column
    :rtype: pandas.DataFrame
    """
    # duplicated() flags the second and later occurrences of each Q&A pair.
    qa_duplicates = annotation_assist[[QUESTION, ANSWER]].duplicated()
    if qa_duplicates.any():
        n = int(qa_duplicates.sum())
        logger.warning(
            "Dropping %d Q&A pairs with multiple annotations (%0.3f%%)" % (n, 100.0 * n / len(annotation_assist)))
        # keep=False removes every copy of an ambiguous pair, not just the repeats.
        annotation_assist.drop_duplicates(subset=[QUESTION, ANSWER], keep=False, inplace=True)
    annotation_assist[IN_PURVIEW] = annotation_assist[IN_PURVIEW].astype("bool")
    annotation_assist[CORRECT] = \
        annotation_assist[IN_PURVIEW] & (annotation_assist[ANNOTATION_SCORE] >= judgment_threshold)
    logger.info("Processed %d judgments" % len(annotation_assist))
    return annotation_assist.drop(ANNOTATION_SCORE, axis="columns")
开发者ID:DharmendraVaghela,项目名称:themis,代码行数:26,代码来源:judge.py
示例5: system_similarity
def system_similarity(systems_data):
    """
    For each system pair, return the number of questions they answered the same.

    :param systems_data: collated results for all systems
    :type systems_data: pandas.DataFrame
    :return: table of pairs of systems and their similarity statistics, indexed by (System 1, System 2)
    :rtype: pandas.DataFrame
    """
    systems_data = drop_missing(systems_data)
    systems = systems_data[SYSTEM].drop_duplicates().sort_values()
    columns = ["System 1", "System 2", "Same Answer", "Same Answer %"]
    # Collect plain rows and build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copies the whole table on every call.
    rows = []
    for x, y in itertools.combinations(systems, 2):
        data_x = systems_data[systems_data[SYSTEM] == x]
        data_y = systems_data[systems_data[SYSTEM] == y]
        # Both sides carry an answer column, so the merge suffixes them with _x and _y.
        m = pandas.merge(data_x, data_y, on=QUESTION)
        n = len(m)
        logger.info("%d question/answer pairs in common for %s and %s" % (n, x, y))
        same_answer = int((m["%s_x" % ANSWER] == m["%s_y" % ANSWER]).sum())
        # Two systems with no questions in common have no defined percentage.
        same_answer_pct = 100.0 * same_answer / n if n else float("nan")
        rows.append([x, y, same_answer, same_answer_pct])
    results = pandas.DataFrame(rows, columns=columns)
    results["Same Answer"] = results["Same Answer"].astype("int64")
    return results.set_index(["System 1", "System 2"])
开发者ID:louisroehrs,项目名称:themis,代码行数:25,代码来源:analyze.py
示例6: get_truth_from_mapped_questions
def get_truth_from_mapped_questions(mapped_questions):
    """
    Build a truth table from questions that map to an answer unit directly or through another question.

    A question either carries a "predefinedAnswerUnit" itself or points at another question through
    "mappedQuestion"; such chains are followed until an answer unit is found. Questions whose chain ends without
    an answer unit, references a missing question, or loops back on itself are counted as unmapped and omitted.

    Each question dictionary is updated in place with its resolved answer id.

    :param mapped_questions: question records, each with at least "id" and "text" keys
    :type mapped_questions: iterable of dict
    :return: question id, question text and answer id for every mapped question
    :rtype: pandas.DataFrame
    """
    # Index the questions by their question id so that mapped questions can be looked up.
    questions = {question["id"]: question for question in mapped_questions}

    def get_pau_mapping(question):
        # Follow the chain iteratively and remember visited ids so a circular mapping cannot recurse forever.
        visited = set()
        while True:
            if "predefinedAnswerUnit" in question:
                return question["predefinedAnswerUnit"]
            if "mappedQuestion" not in question:
                return None
            visited.add(question["id"])
            question_id = question["mappedQuestion"]["id"]
            if question_id in visited:
                logger.warning("Question %s is part of a circular mapping" % question["id"])
                return None
            try:
                question = questions[question_id]
            except KeyError:
                logger.warning("Question %s mapped to non-existent question %s" % (question["id"], question_id))
                return None

    unmapped = 0
    for question in questions.values():
        question[ANSWER_ID] = get_pau_mapping(question)
        if question[ANSWER_ID] is None:
            unmapped += 1
    mapped = [q for q in questions.values() if q[ANSWER_ID] is not None]
    truth = pandas.DataFrame.from_dict({
        QUESTION_ID: [q["id"] for q in mapped],
        QUESTION: [q["text"] for q in mapped],
        ANSWER_ID: [q[ANSWER_ID] for q in mapped],
    })
    logger.info("%d mapped, %d unmapped" % (len(truth), unmapped))
    return truth
开发者ID:DharmendraVaghela,项目名称:themis,代码行数:29,代码来源:xmgr.py
示例7: deakin
def deakin(usage_log):
    """
    Remove low confidence and dialog responses from a usage log, then fix its confidence ranges.

    :param usage_log: usage log with an answer column
    :type usage_log: pandas.DataFrame
    :return: usage log with the offending rows removed
    :rtype: pandas.DataFrame
    """
    # Match the boilerplate literally (its '.' characters are not regex wildcards) and treat missing answers as
    # non-matches so the mask stays strictly boolean for the negation and the count below.
    low_confidence_response = usage_log[ANSWER].str.contains(
        "Here's Watson's response, but remember it's best to use full sentences.", regex=False, na=False)
    logger.info("Removed %d questions with low confidence responses" % sum(low_confidence_response))
    usage_log = usage_log[~low_confidence_response]
    usage_log = filter_usage_log_by_user_experience(usage_log, ["Dialog Response"])
    usage_log = fix_confidence_ranges(usage_log)
    return usage_log
开发者ID:cognitive-catalyst,项目名称:themis,代码行数:8,代码来源:fixup.py
示例8: get_pau
def get_pau(pau_id):
    """
    Look up a PAU by id and return its answer text, title and source file name.

    :param pau_id: id of the PAU to download
    :return: dictionary of answer, title and filename taken from the first PAU returned, or None if nothing was
             returned
    :rtype: dict or None
    """
    candidates = xmgr.get_paus(pau_id)
    if not candidates:
        logger.info("Could not download pau %s" % pau_id)
        return None
    first = candidates[0]
    return {ANSWER: first["responseMarkup"], TITLE: first["title"], FILENAME: first["sourceName"]}
开发者ID:DharmendraVaghela,项目名称:themis,代码行数:8,代码来源:xmgr.py
示例9: judge_sample_handler
def judge_sample_handler(args):
    """
    Print the judged questions together with their frequencies as CSV.

    :param args: parsed command line arguments providing judgments (a list of data frames) and frequency (a data
                 frame)
    """
    judged_questions = pandas.concat(args.judgments)[[QUESTION]].drop_duplicates()
    # Left join so that judged questions without a known frequency are kept.
    sample = pandas.merge(judged_questions, args.frequency, on=QUESTION, how="left")
    total = len(sample)
    logger.info("%d judged questions" % total)
    without_frequency = sum(sample[FREQUENCY].isnull())
    if without_frequency:
        logger.warning(
            "Missing frequencies for %d questions (%0.3f%%)" % (without_frequency, 100.0 * without_frequency / total))
    print_csv(QuestionFrequencyFileType.output_format(sample))
开发者ID:ManaliChanchlani,项目名称:themis,代码行数:9,代码来源:main.py
示例10: download_corpus_from_xmgr
def download_corpus_from_xmgr(xmgr, output_directory, checkpoint_frequency, max_docs):
    """
    Download the corpus from an XMGR project.

    A corpus is a mapping of answer text to answer Ids. It also contains answer titles and the names of the documents
    from which the answers were extracted.

    This can take a long time to complete, so intermediate results are saved in the directory. If you restart an
    incomplete download it will pick up where it left off.

    :param xmgr: connection to an XMGR project REST API
    :type xmgr: XmgrProject
    :param output_directory: directory into which write the corpus.csv file
    :type output_directory: str
    :param checkpoint_frequency: how often to write intermediate results to a checkpoint file
    :type checkpoint_frequency: int
    :param max_docs: maximum number of corpus documents to download, if None, download them all
    :type max_docs: int
    :return: None; the corpus is written to corpus.csv in the output directory
    """
    document_ids_csv = os.path.join(output_directory, "document_ids.csv")
    corpus_csv = os.path.join(output_directory, "corpus.csv")
    # document_ids.csv is deleted at the end of a successful run, so a corpus file without it marks a finished
    # download.
    if os.path.isfile(corpus_csv) and not os.path.isfile(document_ids_csv):
        logger.info("Corpus already downloaded")
        return
    logger.info("Download corpus from %s" % xmgr)
    document_ids = sorted(set(document["id"] for document in xmgr.get_documents()))
    document_ids = document_ids[:max_docs]
    n = len(document_ids)
    downloaded_document_ids = DataFrameCheckpoint(document_ids_csv, [DOCUMENT_ID, "Paus"], checkpoint_frequency)
    corpus = DataFrameCheckpoint(corpus_csv, CorpusFileType.columns)
    try:
        recovered = downloaded_document_ids.recovered
        if recovered:
            logger.info("Recovered %d documents from previous run" % len(recovered))
        document_ids = sorted(set(document_ids) - recovered)
        # Numbering resumes after the recovered documents, so the last document of this run is identified by the
        # last enumerated index rather than by the count of remaining documents.
        start = len(recovered) + 1
        last = start + len(document_ids) - 1
        for i, document_id in enumerate(document_ids, start):
            if i % checkpoint_frequency == 0 or i == start or i == last:
                corpus.flush()
                logger.info(percent_complete_message("Get PAUs from document", i, n))
            paus = xmgr.get_paus_from_document(document_id)
            # The document id and number of PAUs are both integers. Cast them to strings, otherwise pandas will
            # write them as floats.
            for pau in paus:
                corpus.write(pau["id"], pau["responseMarkup"], pau["title"], pau["sourceName"], str(document_id))
            downloaded_document_ids.write(str(document_id), str(len(paus)))
    finally:
        downloaded_document_ids.close()
        corpus.close()
    # Rewrite the corpus without repeated answer ids.
    corpus = from_csv(corpus_csv).drop_duplicates(ANSWER_ID)
    to_csv(corpus_csv, CorpusFileType.output_format(corpus))
    docs = len(from_csv(document_ids_csv))
    os.remove(document_ids_csv)
    logger.info("%d documents and %d PAUs in corpus" % (docs, len(corpus)))
开发者ID:DharmendraVaghela,项目名称:themis,代码行数:55,代码来源:xmgr.py
示例11: train_nlc
def train_nlc(url, username, password, truth, name):
    """
    Train an NLC model on the question to answer id mapping and return the new classifier's id.

    :param url: URL of NLC instance
    :param username: NLC username
    :param password: NLC password
    :param truth: question to answer mapping used in training
    :type truth: pandas.DataFrame
    :param name: name to give the classifier
    :type name: str
    :return: value of "classifier_id" in the service's response
    """
    logger.info("Train model %s with %d instances" % (name, len(truth)))
    # Clean a copy of the two training columns so the caller's frame is left untouched.
    training_data = truth[[QUESTION, ANSWER_ID]].copy()
    # NLC cannot handle newlines.
    training_data[QUESTION] = training_data[QUESTION].str.replace("\n", " ")
    with tempfile.TemporaryFile() as training_file:
        to_csv(training_file, training_data, header=False, index=False)
        # Rewind so the upload reads the file from the beginning.
        training_file.seek(0)
        nlc = NaturalLanguageClassifier(url=url, username=username, password=password)
        r = nlc.create(training_data=training_file, name=name)
        logger.info(pretty_print_json(r))
    return r["classifier_id"]
开发者ID:DharmendraVaghela,项目名称:themis,代码行数:11,代码来源:nlc.py
示例12: nlc_router_test
def nlc_router_test(url, username, password, collate_file, path):
    """
    Query NLC with each testing set to determine the answering system (NLC or Solr), then look up the related
    fields from the collated data (used as an input to the oracle experiment).

    For every fold the questions in Test<fold>.csv are routed with that fold's classifier and the result is written
    to Out<fold>.csv. The per-fold outputs are concatenated, saved as Interim-Result.csv, and joined with the
    collated data on question and system.

    :param url: URL of NLC instance
    :param username: NLC username
    :param password: NLC password
    :param collate_file: collated results created for the oracle experiment
    :type collate_file: pandas.DataFrame
    :param path: directory holding classifier.json and the Test<fold>.csv files; intermediate results are saved here
    :return: collated rows for the routed system of each question, labelled as system 'NLC-as-router'
    :rtype: pandas.DataFrame
    """
    def log_correct(system_data, name):
        n = len(system_data)
        m = sum(system_data[CORRECT])
        if n:
            logger.info("%d of %d correct in %s (%0.3f%%)" % (m, n, name, 100.0 * m / n))
        else:
            # Nothing joined; there is no percentage to report.
            logger.info("%d of %d correct in %s" % (m, n, name))

    # import list of classifier from file
    with open(os.path.join(path, 'classifier.json'), 'r') as f:
        data = json.load(f)
    # NOTE(review): .encode() yields bytes on Python 3; confirm which interpreter this targets before changing it.
    classifier_list = [
        data[x]['NLC+Solr Oracle_fold_{0}'.format(str(x))].encode("utf-8") for x in range(0, NLC_ROUTER_FOLDS)]
    for x in range(0, NLC_ROUTER_FOLDS):
        test = pandas.read_csv(os.path.join(path, "Test{0}.csv".format(str(x))))
        test = test[[QUESTION]]
        test[QUESTION] = test[QUESTION].str.replace("\n", " ")
        classifier_id = classifier_list[x]
        n = NLC(url, username, password, classifier_id, test)
        out_file = os.path.join(path, "Out{0}.csv".format(str(x)))
        logger.info("Testing on fold {0} using NLC classifier {1}".format(str(x), str(classifier_list[x])))
        answer_router_questions(n, set(test[QUESTION]), out_file)
    # Concatenate multiple trained output into single csv file
    dfList = []
    columns = [QUESTION, SYSTEM]
    for x in range(0, NLC_ROUTER_FOLDS):
        df = pandas.read_csv(os.path.join(path, "Out{0}.csv".format(str(x))), header=0)
        dfList.append(df)
    concateDf = pandas.concat(dfList, axis=0)
    concateDf.columns = columns
    concateDf.to_csv(os.path.join(path, "Interim-Result.csv"), encoding='utf-8', index=False)
    # Join operation to get fields from oracle collated file
    result = pandas.merge(concateDf, collate_file, on=[QUESTION, SYSTEM])
    # Keep the routed system under its own column before relabelling the combined system.
    result = result.rename(columns={SYSTEM: ANSWERING_SYSTEM})
    result[SYSTEM] = 'NLC-as-router'
    result[CONFIDENCE] = __standardize_confidence(result)
    log_correct(result, 'NLC-as-router')
    return result
开发者ID:cognitive-catalyst,项目名称:themis,代码行数:54,代码来源:analyze.py
示例13: fallback_combination
def fallback_combination(systems_data, default_system, secondary_system):
    """
    Combine results from two systems into a single fallback system. The default system will answer the question if
    the confidence is above a certain threshold. This method will find the optimal confidence threshold.

    Only questions present in both systems are considered. Every distinct confidence value of the default system is
    tried as the threshold and the one giving the highest precision is kept (the first one found wins ties).

    :param systems_data: collated results for the input systems
    :type systems_data: pandas.DataFrame
    :param default_system: the name of the default system (if confidence > t)
    :type default_system: str
    :param secondary_system: the name of the fallback system (if default_confidence < t)
    :type secondary_system: str
    :return: Fallback results in collated format
    :rtype: pandas.DataFrame
    """
    systems_data = drop_missing(systems_data)
    default_system_data = systems_data[systems_data[SYSTEM] == default_system]
    secondary_system_data = systems_data[systems_data[SYSTEM] == secondary_system]
    intersecting_questions = set(default_system_data[QUESTION]).intersection(set(secondary_system_data[QUESTION]))
    # Logger.warn is a deprecated alias; use warning like the rest of the code base.
    logger.warning("{0} questions in default system".format(len(default_system_data)))
    logger.warning("{0} questions in secondary system".format(len(secondary_system_data)))
    logger.warning("{0} questions in overlapping set".format(len(intersecting_questions)))
    default_system_data = default_system_data[default_system_data[QUESTION].isin(intersecting_questions)]
    secondary_system_data = secondary_system_data[secondary_system_data[QUESTION].isin(intersecting_questions)]
    unique_confidences = default_system_data[CONFIDENCE].unique()
    best_threshold, best_precision = 0, 0
    for threshold in unique_confidences:
        combined_system = _create_combined_fallback_system_at_threshold(
            default_system_data, secondary_system_data, threshold)
        system_precision = precision(combined_system, 0)
        if system_precision > best_precision:
            best_precision = system_precision
            best_threshold = threshold
    logger.info("Default system accuracy: {0}%".format(str(precision(default_system_data, 0) * 100)[:4]))
    logger.info("Secondary system accuracy: {0}%".format(str(precision(secondary_system_data, 0) * 100)[:4]))
    logger.info("Combined system accuracy: {0}%".format(str(best_precision * 100)[:4]))
    logger.info("Combined system best threshold: {0}".format(best_threshold))
    best_system = _create_combined_fallback_system_at_threshold(
        default_system_data, secondary_system_data, best_threshold)
    # Remember which underlying system answered before relabelling the rows as the combined system.
    best_system[ANSWERING_SYSTEM] = best_system[SYSTEM]
    best_system[SYSTEM] = "{0}_FALLBACK_{1}_AT_{2}".format(default_system, secondary_system, str(best_threshold)[:4])
    # With no overlapping questions the combined system is empty and the share below would divide by zero.
    if len(best_system):
        answered_by_default = len(best_system[best_system[ANSWERING_SYSTEM] == default_system])
        logger.info("Questions answered by {0}: {1}%".format(
            default_system, str(100 * float(answered_by_default) / len(best_system))[:4]))
    best_system[CONFIDENCE] = __standardize_confidence(best_system)
    return best_system
开发者ID:cognitive-catalyst,项目名称:themis,代码行数:53,代码来源:analyze.py
示例14: __call__
def __call__(self, filename):
    """
    Load a collated file through the parent class and warn about out of purview answers marked correct.

    :param filename: path of the collated file
    :type filename: str
    :return: the collated data, or None if the file does not exist
    """
    if not os.path.isfile(filename):
        logger.info("{0} does not exist".format(filename))
        return None
    # NOTE(review): super(self.__class__, self) recurses endlessly if this class is ever subclassed; naming the
    # class explicitly would be safer, but the class name is not visible from here.
    collated = super(self.__class__, self).__call__(filename)
    out_of_purview = collated[collated[IN_PURVIEW] == False]
    suspicious = sum(out_of_purview[CORRECT])
    if suspicious:
        total = len(collated)
        logger.warning(
            "%d out of %d question/answer pairs in %s are marked as out of purview but correct (%0.3f%%)"
            % (suspicious, total, filename, 100.0 * suspicious / total))
    return collated
开发者ID:cognitive-catalyst,项目名称:themis,代码行数:13,代码来源:analyze.py
示例15: filter_usage_log_by_user_experience
def filter_usage_log_by_user_experience(usage_log, disallowed):
    """
    Drop every usage log row whose 'user experience' value is on the disallowed list.

    :param usage_log: QuestionsData.csv report log
    :type usage_log: pandas.DataFrame
    :param disallowed: set of disallowed 'user experience' values
    :type disallowed: enumerable set of str
    :return: usage log with questions removed
    :rtype: pandas.DataFrame
    """
    size_before = len(usage_log)
    is_disallowed = usage_log[USER_EXPERIENCE].isin(disallowed)
    retained = usage_log[~is_disallowed]
    removed = size_before - len(retained)
    logger.info("Removed %d questions with user experience '%s'" % (removed, ",".join(disallowed)))
    return retained
开发者ID:louisroehrs,项目名称:themis,代码行数:15,代码来源:fixup.py
示例16: extract_handler
def extract_handler(args):
    """
    Filter the usage logs and print the question/answer pairs extracted from them as CSV.

    :param args: parsed command line arguments providing usage_log (a list of data frames), before, after,
                 user_experience and deakin
    """
    # Do custom fixup of usage logs.
    usage_log = pandas.concat(args.usage_log)
    original_size = len(usage_log)
    if args.before or args.after:
        usage_log = filter_usage_log_by_date(usage_log, args.before, args.after)
    user_experience = set(args.user_experience) | {"DIALOG"}  # DIALOG is always disallowed
    usage_log = filter_usage_log_by_user_experience(usage_log, user_experience)
    if args.deakin:
        usage_log = deakin(usage_log)
    if original_size:
        removed = original_size - len(usage_log)
        logger.info(
            "Removed %d of %d questions (%0.3f%%)" % (removed, original_size, 100.0 * removed / original_size))
    # Extract Q&A pairs from fixed up usage logs.
    qa_pairs = extract_question_answer_pairs_from_usage_logs(usage_log)
    print_csv(QAPairFileType.output_format(qa_pairs))
开发者ID:ManaliChanchlani,项目名称:themis,代码行数:16,代码来源:main.py
示例17: compare_systems
def compare_systems(systems_data, x, y, comparison_type):
    """
    On which questions did system x do better or worse than system y?

    System x did better than system y if it correctly answered a question when system y did not, and vice versa.
    Only in purview questions answered by both systems are compared.

    :param systems_data: collated results for all systems
    :type systems_data: pandas.DataFrame
    :param x: system name
    :type x: str
    :param y: system name
    :type y: str
    :param comparison_type: "better" or "worse"
    :type comparison_type: str
    :return: all question/answer pairs from system x that were either better or worse than system y
    :rtype: pandas.DataFrame
    :raises ValueError: if comparison_type is neither "better" nor "worse"
    """
    def col_name(field, system):
        # Column names after the merge below are "<field> <system>".
        return field + " " + system

    systems_data = drop_missing(systems_data)
    systems_data = systems_data[systems_data[IN_PURVIEW]]
    data_x = systems_data[systems_data[SYSTEM] == x]
    data_y = systems_data[systems_data[SYSTEM] == y][[QUESTION, ANSWER, CONFIDENCE, CORRECT]]
    # Left join followed by dropna() removes the questions that system y did not answer.
    questions = pandas.merge(data_x, data_y, on=QUESTION, how="left", suffixes=(" " + x, " " + y)).dropna()
    n = len(questions)
    logger.info("%d shared question/answer pairs between %s and %s" % (n, x, y))
    x_correct = col_name(CORRECT, x)
    y_correct = col_name(CORRECT, y)
    # The comparisons are element-wise on pandas columns, so == is required here rather than `is`.
    if comparison_type == "better":
        a = questions[x_correct] == True
        b = questions[y_correct] == False
    elif comparison_type == "worse":
        a = questions[x_correct] == False
        b = questions[y_correct] == True
    else:
        raise ValueError("Invalid comparison type %s" % comparison_type)
    d = questions[a & b]
    m = len(d)
    if n:
        logger.info("%d %s (%0.3f%%)" % (m, comparison_type, 100.0 * m / n))
    else:
        # No shared questions: report the count without dividing by zero.
        logger.info("%d %s" % (m, comparison_type))
    d = d[[QUESTION, FREQUENCY,
           col_name(ANSWER, x), col_name(CONFIDENCE, x), col_name(ANSWER, y), col_name(CONFIDENCE, y)]]
    d = d.sort_values([col_name(CONFIDENCE, x), FREQUENCY, QUESTION], ascending=(False, False, True))
    return d.set_index(QUESTION)
开发者ID:louisroehrs,项目名称:themis,代码行数:45,代码来源:analyze.py
示例18: augment_usage_log
def augment_usage_log(usage_log, judgments):
    """
    Attach the judgment columns to each usage log row with a matching question and answer.

    The join is a left join, so usage log rows without a judgment are kept.

    :param usage_log: user interaction logs from QuestionsData.csv XMGR report
    :type usage_log: pandas.DataFrame
    :param judgments: judgments
    :type judgments: pandas.DataFrame
    :return: user interaction logs with additional columns
    :rtype: pandas.DataFrame
    """
    # Temporarily use the judgment column names so the two frames can be joined.
    to_judgment_names = {QUESTION_TEXT: QUESTION, TOP_ANSWER_TEXT: ANSWER}
    renamed = usage_log.rename(columns=to_judgment_names)
    augmented = pandas.merge(renamed, judgments, on=(QUESTION, ANSWER), how="left")
    unique_pairs = len(renamed[[QUESTION, ANSWER]].drop_duplicates())
    if unique_pairs:
        judged = len(judgments)
        logger.info("%d unique question/answer pairs, %d judgments (%0.3f%%)"
                    % (unique_pairs, judged, 100.0 * judged / unique_pairs))
    return augmented.rename(columns={QUESTION: QUESTION_TEXT, ANSWER: TOP_ANSWER_TEXT})
开发者ID:DharmendraVaghela,项目名称:themis,代码行数:18,代码来源:judge.py
示例19: filter_corpus
def filter_corpus(corpus, max_size):
    """
    Drop corpus entries whose answer text is longer than the given size and index the result by answer id.

    :param corpus: corpus with Answer Id and Answer columns
    :type corpus: pandas.DataFrame
    :param max_size: maximum allowed Answer size in characters, or None to keep every answer
    :type max_size: int
    :return: corpus with oversize answers removed
    :rtype: pandas.DataFrame
    """
    if max_size is None:
        return corpus.set_index(ANSWER_ID)
    total = len(corpus)
    within_limit = corpus[ANSWER].str.len() <= max_size
    kept = corpus[within_limit]
    if total:
        removed = total - len(kept)
        logger.info(
            "Filtered %d of %d answers over size %d (%0.3f%%)" % (removed, total, max_size, 100.0 * removed / total))
    return kept.set_index(ANSWER_ID)
开发者ID:cognitive-catalyst,项目名称:themis,代码行数:19,代码来源:fixup.py
示例20: get_answers_from_usage_log
def get_answers_from_usage_log(questions, qa_pairs_from_logs):
    """
    Get answers returned by WEA to questions by looking them up in the usage log.

    Each question in the Q&A pairs must have a unique answer. Questions that do not appear in the usage log are
    kept in the result with a missing answer.

    :param questions: questions to look up in the usage logs
    :type questions: pandas.DataFrame
    :param qa_pairs_from_logs: question/answer pairs extracted from user logs
    :type qa_pairs_from_logs: pandas.DataFrame
    :return: Question, Answer, and Confidence
    :rtype: pandas.DataFrame
    """
    # Left join keeps every requested question, whether or not the logs contain an answer for it.
    answers = pandas.merge(questions, qa_pairs_from_logs, on=QUESTION, how="left")
    missing_answers = answers[answers[ANSWER].isnull()]
    if len(missing_answers):
        logger.warning("%d questions without answers" % len(missing_answers))
    # Count only the rows that actually received an answer.
    logger.info("Answered %d questions" % (len(answers) - len(missing_answers)))
    answers = answers[[QUESTION, ANSWER, CONFIDENCE]].sort_values([QUESTION, CONFIDENCE], ascending=[True, False])
    return answers.set_index(QUESTION)
开发者ID:DharmendraVaghela,项目名称:themis,代码行数:20,代码来源:answer.py
注：本文中的 themis.logger.info 函数示例由纯净天空整理自 GitHub/MSDocs 等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的 License；未经允许，请勿转载。
请发表评论