本文整理汇总了Python中tika.parser.from_file函数的典型用法代码示例。如果您正苦于以下问题:Python from_file函数的具体用法?Python from_file怎么用?Python from_file使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了from_file函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: computeScores
def computeScores(inputDir, outCSV, acceptTypes):
    """Write pairwise cosine-similarity scores of file contents to a CSV.

    Every unordered pair of accepted files under ``inputDir`` is parsed with
    Tika; a term ``Vector`` is built from each parsed content and one row
    ``(file1, file2, cosine)`` is written to ``outCSV``.

    :param inputDir: directory scanned (recursively) by ``filterFiles``
    :param outCSV: path of the CSV file to create
    :param acceptTypes: MIME subtype whitelist forwarded to ``filterFiles``
    """
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])
        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            try:
                row_cosine_distance = [file1, file2]
                file1_parsedData = parser.from_file(file1)
                file2_parsedData = parser.from_file(file2)
                v1 = Vector(file1, ast.literal_eval(file1_parsedData["content"]))
                v2 = Vector(file2, ast.literal_eval(file2_parsedData["content"]))
                row_cosine_distance.append(v1.cosTheta(v2))
                a.writerow(row_cosine_distance)
            except ConnectionError:
                # Tika server hiccup: back off briefly. NOTE(review): the
                # pair is dropped, not retried — preserves original behavior.
                sleep(1)
            except KeyError:
                # Parse result had no usable "content"; skip this pair.
                continue
            except Exception:
                # Best-effort scan: any other per-pair failure is ignored so
                # a single bad file cannot abort the whole run.
                pass
开发者ID:jainn3,项目名称:ContentDetection,代码行数:27,代码来源:cosine_similarity.py
示例2: run_exit_tool_on_known_type
def run_exit_tool_on_known_type(dir_list):
    """Run Tika over every file under the given directories.

    The parse results are discarded; this is executed purely for the side
    effects of the Tika call.
    """
    for entry in get_file_list(dir_list):
        parser.from_file(entry)
开发者ID:fysteven,项目名称:csci599,代码行数:7,代码来源:yaoexiftool.py
示例3: computeScores
def computeScores(inputDir, outCSV, acceptTypes, allKeys):
na_metadata = ["resourceName"]
with open(outCSV, "wb") as outF:
a = csv.writer(outF, delimiter=',')
a.writerow(["x-coordinate","y-coordinate","Similarity_score"])
filename_list = []
for root, dirnames, files in os.walk(inputDir):
dirnames[:] = [d for d in dirnames if not d.startswith('.')]
for filename in files:
if not filename.startswith('.'):
filename_list.append(os.path.join(root, filename))
filename_list = [filename for filename in filename_list if parser.from_file(filename)]
if acceptTypes:
filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes]
else:
print "Accepting all MIME Types....."
files_tuple = itertools.combinations(filename_list, 2)
for file1, file2 in files_tuple:
row_edit_distance = [file1, file2]
file1_parsedData = parser.from_file(file1)
file2_parsedData = parser.from_file(file2)
intersect_features = set(file1_parsedData["metadata"].keys()) & set(file2_parsedData["metadata"].keys())
intersect_features = [feature for feature in intersect_features if feature not in na_metadata ]
file_edit_distance = 0.0
for feature in intersect_features:
file1_feature_value = stringify(file1_parsedData["metadata"][feature])
file2_feature_value = stringify(file2_parsedData["metadata"][feature])
feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/(len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))
file_edit_distance += feature_distance
if allKeys:
file1_only_features = set(file1_parsedData["metadata"].keys()) - set(intersect_features)
file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]
file2_only_features = set(file2_parsedData["metadata"].keys()) - set(intersect_features)
file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]
file_edit_distance += len(file1_only_features) + len(file2_only_features)
file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))
else:
file_edit_distance /= float(len(intersect_features)) #average edit distance
row_edit_distance.append(1-file_edit_distance)
a.writerow(row_edit_distance)
开发者ID:harsham05,项目名称:edit-distance-similarity,代码行数:58,代码来源:edit-value-similarity.py
示例4: command
def command(in_dir, out_dir, tika_server):
    """Extract text from every file in ``in_dir`` into ``.txt`` files.

    :param in_dir: directory of input documents
    :param out_dir: directory for the UTF-8 text output (created if missing)
    :param tika_server: optional Tika server URL; falsy uses the default
    """
    create_dirs(out_dir)
    in_files = get_files(in_dir)
    for fi in in_files:
        if tika_server:
            parsed = parser.from_file(fi, tika_server)
        else:
            parsed = parser.from_file(fi)
        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            # Tika returns content=None for unparseable files; write an
            # empty file instead of crashing on f.write(None).
            f.write(parsed['content'] or '')
开发者ID:WhatWorksWhenForWhom,项目名称:nlpppln,代码行数:14,代码来源:apachetika.py
示例5: intersect
def intersect(json_filename, output_name, index_file, start_index=0, end_index=yaoner.MAX_INT_VALUE):
    """Record which indexed files mention each concept from a JSON file.

    Loads a concept dictionary from ``json_filename`` (keys lowercased),
    scans the files listed in ``index_file`` between the given indices, and
    dumps a JSON mapping each concept to the basenames of files containing it.
    """
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'
    if index_file is None:
        index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'
    with open(json_filename) as json_file:
        json_data = json.load(json_file)
    concept_dictionary = dict((key.lower(), {}) for key in json_data.keys())
    file_list = yaoner.read_index_file(index_file, base_directory, start_index, end_index)
    for offset, relative_path in enumerate(file_list):
        print(start_index + offset)
        parsed = parser.from_file(base_directory + relative_path)
        if parsed.get('content') is not None:
            for word in parsed['content'].split():
                token = word.lower()
                if token in concept_dictionary:
                    # Mark this file as containing the concept (value is a
                    # dummy 1; the dict is used as a set of basenames).
                    concept_dictionary[token][os.path.basename(relative_path)] = 1
    dump(concept_dictionary, output_name + 'from' + str(start_index) + 'to' + str(end_index) + '.json')
    return
开发者ID:fysteven,项目名称:csci599,代码行数:29,代码来源:sweetparser.py
示例6: extract
def extract(path):
    """Run entity, quantity and geo extraction over one document.

    Parses ``path`` with Tika, feeds the content through the Stanford and
    custom entity extractors plus the quantity extractor, and resolves geo
    info when any LOCATION entities were found.

    :return: dict of extraction results keyed by feature name
    """
    parsed = parser.from_file(path)
    text = parsed["content"]
    ners = StanfordExtractor(text).extract()
    entities = CustomEntityExtractor(text).extract()
    quantities = QuantityExtractor(text).getQuantities()
    geo = []
    locations = []
    if len(ners['LOCATION']) > 0:
        topic = GeoTopic(map(lambda loc: loc['name'], ners['LOCATION']))
        geo = topic.getInfo()
        locations = topic.getLocations()
    return {
        'geo': geo,
        'locations': locations,
        'entities': entities['entities'],
        'places': ners['LOCATION'],
        'dates': ners['DATE'],
        'quantities': quantities,
        'metadata': parsed['metadata'],
        'mime-type': parsed['metadata']['Content-Type'],
        'id': idf.set(path)
    }
开发者ID:ml-lab,项目名称:polar-deep-insights,代码行数:27,代码来源:main.py
示例7: run_ner
def run_ner(start_index=0, end_index=MAX_INT_VALUE):
    """Extract measurement mentions from a slice of the indexed corpus.

    Scans files listed in the index between ``start_index`` and
    ``end_index``, skips documents with embedded Tika exceptions or content
    over 1 MiB, and dumps the collected measurements to a JSON file.
    """
    index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'
    file_list = read_index_file(index_file, base_directory, start_index, end_index)
    measurement_list = []
    # enumerate replaces the original manual `index` bookkeeping; the counter
    # advanced exactly once per entry on every path through the loop body.
    for index, entry in enumerate(file_list, start=start_index):
        print(index)
        parsed = parser.from_file(base_directory + entry)
        if 'metadata' in parsed and 'X-TIKA:EXCEPTION:embedded_exception' in parsed['metadata']:
            # Tika hit an exception inside an embedded document; skip it.
            continue
        content = parsed.get('content')
        if content is not None:
            if len(content) > 1 * 1024 * 1024:
                # Skip very large documents (> 1 MiB of extracted text).
                continue
            measurements = extract_measurement(content)
            if measurements:
                measurement_list.append({entry.split('/')[-1]: measurements})
    dump_to_json(measurement_list, '/Users/Frank/working-directory/ner-measurement-mentions/',
                 'from' + str(start_index) + 'to' + str(end_index))
    return
开发者ID:fysteven,项目名称:csci599,代码行数:30,代码来源:yaoner.py
示例8: filterFiles
def filterFiles(inputDir, acceptTypes):
filename_list = []
for root, dirnames, files in os.walk(inputDir):
dirnames[:] = [d for d in dirnames if not d.startswith('.')]
for filename in files:
if not filename.startswith('.'):
filename_list.append(os.path.join(root, filename))
filename_list = [filename for filename in filename_list if parser.from_file(filename)]
if acceptTypes:
filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes]
else:
print "Accepting all MIME Types....."
return filename_list
开发者ID:harshfatepuria,项目名称:Scientific-Content-Enrichment-in-the-Text-Retrieval-Conference-TREC-Polar-Dynamic-Domain-Dataset,代码行数:16,代码来源:kmeans_ext.py
示例9: compareValueSimilarity
def compareValueSimilarity (fileDir, encoding = 'utf-8') :
    """Score each file by how much of the corpus-wide metadata it shares.

    Builds a "key: value" feature set per file from Tika metadata, unions
    them, and returns (scores sorted descending, per-file raw metadata).

    :param fileDir: iterable of file paths (despite the name, not a directory)
    :param encoding: byte encoding applied to keys/values (Python 2 str math)
    :return: (sorted list of (filename, resemblance) tuples, metadata dict)
    """
    union_feature_names = set()
    file_parsed_data = {}
    resemblance_scores = {}
    file_metadata={}
    for filename in fileDir:
        file_parsed = []
        parsedData = parser.from_file(filename)
        file_metadata[filename] = parsedData["metadata"]
        for key in parsedData["metadata"].keys() :
            # Tika metadata values may be scalars or lists; [0] takes the first.
            value = parsedData["metadata"].get(key)[0]
            if isinstance(value, list):
                # NOTE(review): this re-indexes [0] and concatenates its
                # elements — it only fires when the first element is itself a
                # list; looks suspicious, verify against real Tika output.
                value = ""
                for meta_value in parsedData["metadata"].get(key)[0]:
                    value += meta_value
            file_parsed.append(str(key.strip(' ').encode(encoding) + ": " + value.strip(' ').encode(encoding)))
        file_parsed_data[filename] = set(file_parsed)
        union_feature_names = union_feature_names | set(file_parsed_data[filename])
    total_num_features = len(union_feature_names)
    for filename in file_parsed_data.keys():
        # NOTE(review): the intersection with the union is always the file's
        # own feature set (union is a superset), and the `overlap = {}` line
        # is dead — so the score is |features| / |union features|.
        overlap = {}
        overlap = file_parsed_data[filename] & set(union_feature_names)
        resemblance_scores[filename] = float(len(overlap))/total_num_features
    sorted_resemblance_scores = sorted(resemblance_scores.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_resemblance_scores, file_metadata
开发者ID:harsham05,项目名称:etllib,代码行数:32,代码来源:etllib.py
示例10: load_topics
def load_topics(filename):
    """Accumulate language, noun and metadata features for one file.

    Appends to the module-level collections: ``languages`` (detected
    language), ``words`` (nouns from the content) and ``metadata`` (selected
    Tika metadata values).

    :param filename: path of the document to parse
    """
    languages.append(language.from_file(filename))
    parser_obj = parser.from_file(filename)
    if 'content' in parser_obj and parser_obj['content']:
        words.extend(get_nouns(parser_obj['content']))
    if 'metadata' not in parser_obj:
        return
    metadata_dict = parser_obj['metadata']
    # Tika values may be scalars or lists; use isinstance instead of the
    # original `type(x) == type([])` checks.
    if 'Author' in metadata_dict:
        author = metadata_dict['Author']
        # Keep only the first author when several are reported.
        metadata.append(author[0] if isinstance(author, list) else author)
    if 'xmp:CreatorTool' in metadata_dict:
        creator = metadata_dict['xmp:CreatorTool']
        # Unlike the other keys, every creator-tool value is kept.
        if isinstance(creator, list):
            metadata.extend(creator)
        else:
            metadata.append(creator)
    if 'Content-Type' in metadata_dict:
        ctype = metadata_dict['Content-Type']
        metadata.append(ctype[0] if isinstance(ctype, list) else ctype)
    if 'Company' in metadata_dict:
        company = metadata_dict['Company']
        metadata.append(company[0] if isinstance(company, list) else company)
开发者ID:durgaravi,项目名称:polardata-analysis-3,代码行数:29,代码来源:wordcloud.py
示例11: __init__
def __init__(self, fileName):
    """Parse fileName with Tika and keep cleaned content, title, language.

    Sets ``self.content`` (whitespace/quote-stripped text),
    ``self.title`` (cleaned metadata title, 'Untitled' if absent) and
    ``self.lang`` (Tika language detection result).
    """
    parsed = parser.from_file(fileName)
    metadata = parsed["metadata"]
    rx = re.compile('\W+')
    # Strip newlines, tabs and quotes, then collapse non-word runs to a
    # single space.
    content = parsed["content"]
    for ch in ('\n', '\t', '\'', '\"'):
        content = content.replace(ch, '')
    self.content = rx.sub(' ', content).strip()
    # Title: narrow the original bare `except:` to the error actually
    # expected here (missing key). The original also replaced '\t' twice.
    try:
        title = metadata['title']
    except KeyError:
        title = 'Untitled'
    for ch in ('\t', '\'', '\"'):
        title = title.replace(ch, '')
    self.title = rx.sub(' ', title).strip()
    self.lang = language.from_file(fileName)
开发者ID:intfrr,项目名称:darksearch,代码行数:28,代码来源:tk.py
示例12: getKeywords
def getKeywords(pdfFile,Occur):
    """Extract frequent keywords from a PDF via Tika word frequencies.

    :param pdfFile: path of the PDF to parse
    :param Occur: minimum occurrence count; when > 0 the lowercased long
        (>6 chars) frequent words are returned, otherwise a list of
        {word: count-as-string} dicts for every word
    :return: ``shortkey`` list if Occur > 0, else ``keywords`` list
    """
    tikaurl= tika_obo.getTikaAddress()
    parsed = parser.from_file(pdfFile, tikaurl)
    metadata = parsed["metadata"]
    doccontent = parsed["content"]
    # Tokenize, drop stopwords, and build a frequency-sorted (count, word) list.
    fullwordlist = obo.stripNonAlphaNum(doccontent)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)
    count = 0  # NOTE(review): incremented below but never used
    keywords = []
    shortkey = []
    maxoccur = Occur
    for s in sorteddict:
        numocc = int(s[0])
        word = s[1].encode('utf-8')
        if numocc > maxoccur:
            keyword = { word : str(numocc) }
            keywords.append(keyword)
            # Long words above the threshold also feed the short summary list.
            if len(word)>6:
                shortkey.append(word.lower())
        count = count + 1
    if Occur > 0:
        return shortkey
    return keywords
开发者ID:digcat,项目名称:cmistest,代码行数:28,代码来源:keywords.py
示例13: main
def main(file_name):
fi = open("sentences.txt", "w+")
fi_summary = open("summary.txt", "w+")
fi_cool = open("wtv.txt", "w+")
score_sentences = SentenceScores()
parsed = parser.from_file(file_name)
print parsed["metadata"]
content = parsed["content"]
content = content.strip()
fi_cool.write(content.encode("utf-8"))
sentences = content.split(". ")
sentences = map(clean_sentence, sentences)
lines = score_sentences.get_summary_lines(sentences)
max_len = len(lines) / 3
needed_lines = lines[0:max_len]
sorted_lines = sorted(needed_lines, key=lambda x: x[0])
for line_num, score in sorted_lines:
fi_summary.write((str(line_num+1)+", "+sentences[line_num]).encode("utf-8"))
for sentence in sentences:
fi.write(sentence.encode("utf-8"))
fi.close()
fi_summary.close()
开发者ID:dragon-fury,项目名称:summary-ly,代码行数:27,代码来源:pdf_tika_parser.py
示例14: getTikaTags
def getTikaTags(filename):
    """Build a CMIS-style property dict for a file from Tika metadata.

    Parses ``filename`` via the configured Tika server, copies every
    metadata entry into string-keyed properties, and enriches the result
    with tax-haven ranking details looked up by resource name.

    :return: dict of jsonprops ready for the CMIS layer
    """
    import tika
    from tika import parser
    import obo
    import tika_obo
    import gethavens
    parsed = parser.from_file(filename, getTikaAddress())
    metadata = parsed["metadata"]
    content = parsed["content"]
    jsonprops = {'cm:title': str(metadata['resourceName'])}
    for key in metadata:
        jsonprops[str(key)] = str(metadata[key])
    title = jsonprops['resourceName']
    namebreak = title.split('.')  # retained from the original; unused here
    havenrecord = gethavens.getPropertiesHaven(str(jsonprops['resourceName']))
    jsonprops['Description'] = ('Ranked:' + str(havenrecord['rank'])
                                + ' most secretive Tax Haven\nhttps://www.google.co.uk/maps/place/'
                                + havenrecord['country'])
    jsonprops['Name'] = havenrecord['country']
    jsonprops['cmis:title'] = str(title)
    jsonprops['cmis:author'] = 'admin'
    return jsonprops
开发者ID:digcat,项目名称:cmistest,代码行数:28,代码来源:tika_obo.py
示例15: _request_pdf_data
def _request_pdf_data(self, url):
    """Fetch and parse a PDF at ``url``.

    :return: dict with the source url plus the extracted title and body
    """
    parsed = parser.from_file(url)
    result = {'url': url}
    result['title'] = self._parse_pdf_title(parsed)
    result['body'] = self._parse_pdf_body(parsed)
    return result
开发者ID:takeshi0406,项目名称:twlist_to_urllist,代码行数:7,代码来源:twlist_to_urllist.py
示例16: search_content
def search_content(file_path, expressions):
    """Open a file and search its extracted contents against a set of RegEx.

    :param file_path: path of the file to parse with Tika
    :param expressions: iterable of objects carrying ``.regex`` and ``.name``
    :return: list of CazRegMatch objects, one per matching (line, expression)
    """
    matches = []
    data = parser.from_file(file_path)
    if not data:
        # There is no content that could be extracted.
        return matches
    # Read into an I/O buffer for better readline support.
    # TODO this may create a very large buffer for larger files;
    # we may need to convert this to a while readline() loop.
    content = io.StringIO(data['content'])
    for count, line in enumerate(content.readlines(), start=1):
        if not line:
            continue
        for rex in expressions:
            # Record a match for every expression the line satisfies.
            res = rex.regex.search(line)
            if res:
                matches.append(cazobjects.CazRegMatch(res, file_path, count, rex.name))
    return matches
开发者ID:DataGravityInc,项目名称:cazador,代码行数:26,代码来源:cazscan.py
示例17: file_parser
def file_parser(fname, pages=None):
    """Extract text from a document, using pdfparser for PDFs, Tika otherwise.

    :param fname: path of the file to parse
    :param pages: maximum number of PDF pages to read; ``None`` reads all.
        (In the original, Python 2 compared ``i >= None`` as True, so the
        default stopped after the first page — fixed here.)
    :return: UTF-8 encoded text
    """
    if magic.from_file(fname, mime=True) == 'application/pdf':
        text_array = []
        i = 0
        d = pdf.Document(fname)
        for i, p in enumerate(d, start=1):
            for f in p:
                for b in f:
                    for l in b:
                        text_array.append(l.text.encode('UTF-8'))
            if pages is not None and i >= pages:
                # Stop after the requested number of pages.
                break
        log.debug("Processed %i pages (%i max)", i, pages)
        return '\n'.join(text_array)
    else:
        # Tika may return None content for unparseable files.
        content = parser.from_file(fname)['content']
        return (content or '').encode('UTF-8')
开发者ID:openstate,项目名称:open-raadsinformatie,代码行数:27,代码来源:file_parsing.py
示例18: makeSearchable
def makeSearchable(self, src, subdir):
    """OCR pipeline for one PDF under ``subdir``.

    Steps: extract the embedded text layer with Tika, delete that layer from
    the PDF, rasterize the remainder with Ghostscript, then preprocess,
    OCR and merge the per-image text (delegated to instance methods).

    :param src: PDF file name (with extension) inside the rawPdfs folder
    :param subdir: working directory containing ``examplePDFs``
    """
    rootDir = subdir + "/examplePDFs"
    pdfPath = rootDir + "/rawPdfs"
    finishedTextPath = rootDir + "/finishedText"
    removed_text_path = rootDir + "/removedText"
    gsPath = rootDir + "/gsPdfs"
    imagesProcessedPath = rootDir + "/imagesProcessed"
    imageText = rootDir + "/imageText"
    # Create the whole working tree up front (replaces six copy-pasted
    # exists/makedirs stanzas).
    for directory in (pdfPath, finishedTextPath, removed_text_path,
                      gsPath, imagesProcessedPath, imageText):
        if not os.path.exists(directory):
            os.makedirs(directory)
    filename, fileType = src.rsplit(".", 1)
    print("\n**********************")
    print("Processing file: " + filename)
    print("**********************\n")
    # Extract the text layer Tika can reach directly.
    print("Getting text that can be easily extracted...")
    rawText = parser.from_file(pdfPath + "/" + src)
    if rawText["content"] is None:
        print("Found no text to extract, continuing process")
    else:
        # `with` closes the handle even on write errors (the original used
        # explicit open/close with no finally).
        with open(finishedTextPath + "/" + filename + ".txt", 'w') as fileOutput:
            fileOutput.write(rawText["content"].encode("utf-8"))
    # Remove the already-extracted text layer from the pdf.
    print("Removing text from pdf")
    process1 = subprocess.Popen(['java', '-jar', 'PdfTextDeleter.jar', src,
                                 os.path.join(removed_text_path, src)])
    process1.wait()
    # Per-file image folders for the Ghostscript / OCR stages.
    for directory in (gsPath + "/" + filename + "-imgs",
                      imagesProcessedPath + "/" + filename + "-imgs",
                      imageText + "/" + filename + "-imgs"):
        if not os.path.exists(directory):
            os.makedirs(directory)
    print("Converting left over pdf to images")
    process2 = subprocess.Popen(["gs", "-dNOPAUSE", "-sFONTPATH=/opt/local/share/ghostscript/9.16/Resource/Font/",
                                 "-sDEVICE=pngalpha", "-r300", "-dBATCH",
                                 "-sOutputFile=" + gsPath + "/" + filename + "-imgs" + "/" + filename + "-%03d" ".png",
                                 removed_text_path + "/" + src], env={'PATH': '/opt/local/bin/'})
    process2.wait()
    self.preprocessImages(rootDir, subdir, src)
    self.applyOCRToImages(rootDir, subdir, src)
    self.mergeTextFiles(rootDir, subdir, src)
开发者ID:RandalMoss,项目名称:pdf-search,代码行数:57,代码来源:pdf_text_extractor.py
示例19: parse_file
def parse_file(self, path):
    """Parse the file at the given path with Tika.

    :param path: path to file
    :return: parsed content dict, augmented with a 'file' key holding the
        absolute path
    """
    result = tkparser.from_file(path)
    result['file'] = os.path.abspath(path)
    return result
开发者ID:chrismattmann,项目名称:parser-indexer-py,代码行数:9,代码来源:parser.py
示例20: computeScores
def computeScores(inputDir, outCSV, acceptTypes):
    """Write pairwise metadata cosine-similarity scores to a CSV.

    Each unordered pair of accepted files is parsed with Tika; a ``Vector``
    is built from each file's metadata and one row
    ``(file1, file2, cosine)`` is appended to ``outCSV``.
    """
    with open(outCSV, "wb") as outF:
        writer = csv.writer(outF, delimiter=',')
        writer.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])
        for file1, file2 in itertools.combinations(filterFiles(inputDir, acceptTypes), 2):
            parsed1 = parser.from_file(file1)
            parsed2 = parser.from_file(file2)
            score = Vector(parsed1["metadata"]).cosTheta(Vector(parsed2["metadata"]))
            writer.writerow([file1, file2, score])
注:本文中的tika.parser.from_file函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论