This article collects typical usage examples of the Python function messytables.headers_processor. If you have been wondering what headers_processor does, how to call it, or what it looks like in real code, the hand-picked examples below should help.
The sections below present 20 code examples of headers_processor, sorted by popularity by default. You can upvote the examples you find useful; your ratings help the system recommend better Python code examples.
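Nearly every example below follows the same three-step pattern. Here it is as a minimal, self-contained sketch (the file name data.csv is a placeholder): headers_guess inspects a sample to locate the header row and its contents, headers_processor attaches those names so each cell carries a .column attribute, and offset_processor starts iteration below the header row.

from messytables import (CSVTableSet, headers_guess,
                         headers_processor, offset_processor)

with open('data.csv', 'rb') as fh:  # placeholder file name
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    # find the header row and its column names from a sample
    offset, headers = headers_guess(row_set.sample)
    # label every cell with its column name (cell.column)
    row_set.register_processor(headers_processor(headers))
    # skip past the header row when iterating
    row_set.register_processor(offset_processor(offset + 1))
    for row in row_set:
        print({cell.column: cell.value for cell in row})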
Example 1: convert
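This CSV-to-JSON converter names blank headers, de-duplicates repeated column names, registers the cleaned-up header list via headers_processor, and collects every row into a dict keyed by column id.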
def convert(self):
    table_set = CSVTableSet.from_fileobj(self.stream)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)
    fields = []
    dup_columns = {}
    noname_count = 1
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', str(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    result = []
    for row in row_set:
        # a fresh dict per row, so each appended entry stays distinct
        data_row = {}
        for index, cell in enumerate(row):
            data_row[cell.column] = cell.value
        result.append(data_row)
    return fields, result
Developer: Big-Data | Project: data-converters | Lines: 31 | Source: csv_json_converter.py
Example 2: get_schema
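Here headers_processor is combined with type_guess to derive a simple schema: for each column, the header name, the guessed type, and a sample value.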
def get_schema(self, filename):
    """
    Guess schema using messytables
    """
    table_set = self.read_file(filename)
    # Have I been able to read the file?
    if table_set is None:
        return []
    # Get the first table as a row set
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    # Get a sample as well...
    sample = next(row_set.sample)
    clean = lambda v: str(v) if not isinstance(v, str) else v
    schema = []
    for i, h in enumerate(headers):
        schema.append([h,
                       str(types[i]),
                       clean(sample[i].value)])
    return schema
Developer: purnima215 | Project: dgit | Lines: 29 | Source: tableformat.py
Example 3: generate_table
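This importer registers the guessed headers, maps a spreadsheet sheet into an internal tabular document, and logs (rather than aborts on) rows that fail to decode.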
def generate_table(self, document, meta, sheet, row_set):
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    tabular = self.create_tabular(sheet, row_set.name)
    columns = [tabular.add_column(h) for h in headers]
    if not len(columns):
        return

    def generate_rows():
        for i, row in enumerate(row_set):
            record = {}
            try:
                for cell, column in zip(row, columns):
                    record[column.name] = string_value(cell.value)
                if len(record):
                    for column in columns:
                        record[column.name] = record.get(column.name, None)
                    yield record
            except Exception as exception:
                log.warning("Could not decode row %s in %s: %s",
                            i, meta, exception)

    document.insert_records(sheet, generate_rows())
    return tabular
Developer: CodeForAfrica | Project: aleph | Lines: 25 | Source: tabular.py
Example 4: generate_table
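A variant of the previous importer that builds a TabularSchema, drops and recreates the backing table, and bulk-loads the generated records.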
def generate_table(self, meta, sheet, row_set):
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    schema = TabularSchema({
        'sheet_name': row_set.name,
        'content_hash': meta.content_hash,
        'sheet': sheet
    })
    columns = [schema.add_column(h) for h in headers]
    log.info("Creating internal table: %s columns, table: %r",
             len(columns), schema.table_name)
    tabular = Tabular(schema)
    tabular.drop()
    tabular.create()

    def generate_rows():
        for i, row in enumerate(row_set):
            record = {}
            for cell, column in zip(row, columns):
                record[column.name] = string_value(cell.value)
            if len(record):
                for column in columns:
                    record[column.name] = record.get(column.name, None)
                yield record
        log.info("Loaded %s rows.", i)

    tabular.load_iter(generate_rows())
    return schema
Developer: DavidLemayian | Project: aleph | Lines: 29 | Source: tabular.py
Example 5: main
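A command-line uploader for Myria: it guesses headers and types, detects the header-less case by checking whether the header row itself casts cleanly to the guessed types, and then ships the data.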
def main(argv=None):
    args = parse_args(argv)
    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())
    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s"
                         % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True,
                       types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            [t.cast(v) for (t, v) in zip(types, headers)]
        except:
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname,
                                           port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data,
                                     args.overwrite, **kwargs)
        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
Developer: helgag | Project: myria-python | Lines: 59 | Source: upload_file.py
Example 6: test_guess_headers
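A unit test covering both modes of headers_processor: names guessed by headers_guess, and an explicitly supplied list such as ["foo", "bar"].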
def test_guess_headers(self):
    fh = horror_fobj("weird_head_padding.csv")
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set)
    assert "Frauenheilkunde" in data[9][0].value, data[9][0].value

    fh = horror_fobj("weird_head_padding.csv")
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    row_set.register_processor(headers_processor(["foo", "bar"]))
    data = list(row_set)
    assert "foo" in data[12][0].column, data[12][0]
    assert "Chirurgie" in data[12][0].value, data[12][0].value
Developer: bearrito | Project: messytables | Lines: 17 | Source: test_read.py
Example 7: lines
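Streams a remote CSV and yields each row as a {column: value} dict, taking the first sample row as the header.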
def lines(self):
    fh = urlopen(self.source.url)
    row_set = CSVRowSet('data', fh, window=3)
    headers = list(row_set.sample)[0]
    headers = [c.value for c in headers]
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(1))
    for row in row_set:
        yield dict([(c.column, c.value) for c in row])
Developer: fucc1 | Project: FPA_Core | Lines: 9 | Source: __init__.py
Example 8: test_read_encoded_characters_csv
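A unit test confirming that non-ASCII cell values (here, Traditional Chinese place names) survive header and offset processing intact.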
def test_read_encoded_characters_csv(self):
    fh = horror_fobj('characters.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set)
    assert_equal(382, len(data))
    assert_equal(data[0][2].value, u'雲嘉南濱海國家風景區管理處')
    assert_equal(data[-1][2].value, u'沈光文紀念廳')
Developer: ahlusar1989 | Project: messytables | Lines: 11 | Source: test_read.py
Example 9: parse
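Parses an Excel sheet into an iterator of row dicts plus field metadata, optionally guessing column types and mapping dateutil types to 'DateTime'.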
def parse(stream, excel_type='xls', sheet=1, guess_types=True, **kwargs):
    '''Parse Excel (xls or xlsx) to structured objects.

    :param excel_type: xls | xlsx
    :param sheet: index of sheet in spreadsheet to convert (starting from
        index = 1)
    '''
    sheet_number = int(sheet) - 1

    xlsclass = XLSTableSet
    if excel_type == 'xlsx':
        xlsclass = XLSXTableSet
    table_set = xlsclass.from_fileobj(stream)
    try:
        row_set = table_set.tables[sheet_number]
    except IndexError:
        raise Exception('This file does not have sheet number %d' %
                        (sheet_number + 1))
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guess_types = [StringType, IntegerType, FloatType, DecimalType,
                       DateUtilType]
        row_types = type_guess(row_set.sample, guess_types)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', str(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}
Developer: Web5design | Project: dataconverters | Lines: 54 | Source: xls.py
Example 10: proc
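Derives a Hive DDL from the guessed headers and types, writes the rows to a delimited temp file, and loads both through the hive command-line client.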
def proc(f, database_name, table_name):
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    hive_data_file = tempfile.NamedTemporaryFile(mode='w')
    fields_ddl = ','.join([
        ' {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    ])
    hive_sql = '''
DROP TABLE IF EXISTS {0};
CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
           datetime.datetime.now().isoformat())

    hive_cmd_file = tempfile.NamedTemporaryFile(mode='w')
    print(hive_sql, file=hive_cmd_file)
    hive_cmd_file.flush()

    row_set.register_processor(messytables.types_processor(types))

    for row in row_set:
        print('\001'.join(map(str, [c.value for c in row])),
              file=hive_data_file)
    hive_data_file.flush()

    subprocess.call([
        'hive',
        '--database', database_name,
        '-f', hive_cmd_file.name,
    ])
Developer: Bridg | Project: bridg-messytable | Lines: 51 | Source: add_messytable.py
Example 11: test_read_head_padding_csv
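A unit test for a CSV with oddly padded header rows: the guessed headers are registered and every data row is checked for the expected width.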
def test_read_head_padding_csv(self):
    fh = horror_fobj("weird_head_padding.csv")
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    assert 11 == len(headers), headers
    assert_equal(u"1985", headers[1].strip())
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set.sample)
    for row in row_set:
        assert_equal(11, len(row))
    value = data[1][0].value.strip()
    assert value == u"Gefäßchirurgie", value
Developer: bearrito | Project: messytables | Lines: 14 | Source: test_read.py
Example 12: validate_file
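A CKAN upload validator: it rejects files with more than one data sheet, over-long headers, disallowed header characters, or duplicate column names.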
def validate_file(file_tmp, file_name, tmp_filepath):
    log.info("upload: checking file * %s * ", file_name)
    MAX_HEADER_LENGTH = 64
    # characters that are not allowed in headers: - ' " ’ ‘
    inappropriate_chars = re.compile(r"[\-'\"\u2018\u2019]")
    datastore_ext = config.get('ckan.mimetype_guess', "csv xls xlsx tsv")
    tmp_file_name, tmp_file_ext = os.path.splitext(file_name)
    # check if it is a datastore file (csv xls xlsx tsv)
    if tmp_file_ext[1:].lower() in datastore_ext:
        table_set = any_tableset(file_tmp)
        # check that there is only one data sheet in the file
        if len(table_set.tables) > 1:
            rollback_tmp(file_tmp, tmp_filepath)
            log.error("upload: the file * %s * was not uploaded - there is more than one data sheet in the file", file_name)
            raise logic.ValidationError(
                {'upload': ['There is more than one data sheet in the file']}
            )
        else:
            row_set = table_set.tables[0]
            # guess header names and the offset of the header:
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            for header in headers:
                # too long header
                if len(header) > MAX_HEADER_LENGTH:
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error("upload: the file * %s * was not uploaded - too long header - * %s *",
                              file_name, header)
                    raise logic.ValidationError(
                        {'upload': ['too long header (64 max)']}
                    )
                # not allowed characters in header ( - ' " ’ ‘)
                if inappropriate_chars.search(header):
                    rollback_tmp(file_tmp, tmp_filepath)
                    log.error("upload: the file * %s * was not uploaded - there are inappropriate characters in headers * %s *",
                              file_name, header)
                    raise logic.ValidationError(
                        {'upload': ['there are inappropriate characters in headers (apostrophe/apostrophes/dash)']}
                    )
            # Check for duplicate fields
            unique_fields = set(headers)
            if not len(unique_fields) == len(headers):
                rollback_tmp(file_tmp, tmp_filepath)
                log.error("upload: the file * %s * was not uploaded - duplicate column names are not supported", file_name)
                raise logic.ValidationError({'upload': ['Duplicate column names are not supported']})
        log.info("passed validation successfully - the file * %s * was uploaded to CKAN (filestore)", file_name)
    else:
        pass
Developer: CIOIL | Project: DataGovIL | Lines: 50 | Source: file_validators.py
Example 13: get_diff
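Loads two tabular files through the same header-guessing pipeline and computes their difference with the daff library.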
def get_diff(self, filename1, filename2):
    ext = filename1.split(".")[-1].lower()
    if ext not in ['csv', 'tsv', 'xls']:
        return None

    csvs = {}
    for f in [filename1, filename2]:
        table_set = self.read_file(f)
        if table_set is None:
            raise Exception("Invalid table set")
        row_set = table_set.tables[0]
        # Guess the headers
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        # Output of the row set is a structure
        csvs[f] = [headers]
        for row in row_set:
            csvs[f].append([r.value for r in row])

    # Loaded csv1 and csv2
    table1 = daff.PythonTableView(csvs[filename1])
    table2 = daff.PythonTableView(csvs[filename2])
    alignment = daff.Coopy.compareTables(table1, table2).align()

    data_diff = []
    table_diff = daff.PythonTableView(data_diff)
    flags = daff.CompareFlags()
    highlighter = daff.TableDiff(alignment, flags)
    highlighter.hilite(table_diff)

    # Parse the differences
    diff = self.parse_diff(table_diff)
    return diff
Developer: purnima215 | Project: dgit | Lines: 48 | Source: tableformat.py
Example 14: csvimport_table
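A test helper that wires up the header, offset, and type processors on a CSV fixture and returns the parsed fields and rows.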
def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)

    return fields, rows
Developer: trickvi | Project: spendb | Lines: 18 | Source: helpers.py
Example 15: load_data
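Yields rows of a remote CSV as dicts; when no URL is configured it falls back to a single field/value pair.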
def load_data(config):
    if 'url' not in config:
        yield {
            config.get('field'): config.get('value')
        }
        return
    fh = urlopen(config.get('url'))
    table_set = CSVTableSet.from_fileobj(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    for row in row_set:
        row = [(c.column, c.value) for c in row]
        yield dict(row)
    fh.close()
Developer: pombredanne | Project: journoid | Lines: 19 | Source: process.py
Example 16: parse_table
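A row-level extraction generator (Python 2 code) that yields (error, fields, data) triples and skips rows whose values are all empty.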
def parse_table(source):
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"', window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            row = row_iter.next()  # Python 2 iterator protocol
            if not len(fields):
                fields = generate_field_spec(row)
            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue
            yield None, fields, data
        except StopIteration:
            return
        except Exception, e:  # Python 2 except syntax
            yield e, fields, None
Developer: CivicVision | Project: datahub | Lines: 43 | Source: extract.py
Example 17: resource_row_set
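Returns a row set with the header, offset, and type processors already registered, for a package resource fetched over HTTP.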
def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    table_set = any_tableset(resource.fh(),
                             extension=resource.meta.get('extension'),
                             mimetype=resource.meta.get('mime_type'))
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set
Developer: 01- | Project: loadkit | Lines: 20 | Source: table.py
Example 18: generate_table
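A slimmed-down version of Example 3 without per-row error handling.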
def generate_table(self, document, sheet, row_set):
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    tabular = self.create_tabular(sheet, row_set.name)
    columns = [tabular.add_column(h) for h in headers]
    if not len(columns):
        return

    def generate_rows():
        for row in row_set:
            record = {}
            for cell, column in zip(row, columns):
                record[column.name] = string_value(cell.value)
            if len(record):
                for column in columns:
                    record[column.name] = record.get(column.name, None)
                yield record

    document.insert_records(sheet, generate_rows())
    return tabular
Developer: 01- | Project: aleph | Lines: 21 | Source: tabular.py
Example 19: lines
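Streams a remote CSV and post-processes each row, expanding COFOG and GFSM classification codes into several derived code/label fields.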
def lines(self):
    fh = urlopen(self.source.url)
    row_set = CSVRowSet('data', fh, window=3)
    headers = list(row_set.sample)[0]
    headers = [c.value for c in headers]
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(1))
    for row in row_set:
        row_dict = dict([(c.column, c.value) for c in row])
        # Rename id to row_id
        row_dict['row_id'] = row_dict.pop('id')
        # Set time as empty string to use the default value
        row_dict['time'] = ''
        # Transform COFOG field into six fields with code and label as
        # the same value
        cofog = row_dict.pop('cofog', None)
        if cofog:
            row_dict['cofog1code'] = self.cofog_code(cofog, level=1)
            row_dict['cofog1label'] = self.cofog_code(cofog, level=1)
            row_dict['cofog2code'] = self.cofog_code(cofog, level=2)
            row_dict['cofog2label'] = self.cofog_code(cofog, level=2)
            row_dict['cofog3code'] = self.cofog_code(cofog, level=3)
            row_dict['cofog3label'] = self.cofog_code(cofog, level=3)
        # Transform gfsm expense field into three fields
        gfsmexpense = row_dict.pop('gfsmexpense', None)
        if gfsmexpense:
            row_dict['gfsmexpense1'] = self.gfsm_code(gfsmexpense, level=1)
            row_dict['gfsmexpense2'] = self.gfsm_code(gfsmexpense, level=2)
            row_dict['gfsmexpense3'] = self.gfsm_code(gfsmexpense, level=3)
        # Transform gfsm revenue field into three fields
        gfsmrevenue = row_dict.pop('gfsmrevenue', None)
        if gfsmrevenue:
            row_dict['gfsmrevenue1'] = self.gfsm_code(gfsmrevenue, level=1)
            row_dict['gfsmrevenue2'] = self.gfsm_code(gfsmrevenue, level=2)
            row_dict['gfsmrevenue3'] = self.gfsm_code(gfsmrevenue, level=3)
        yield row_dict
Developer: hagino3000 | Project: openspending | Lines: 40 | Source: __init__.py
Example 20: convert
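The Excel counterpart of Example 1; it additionally serializes datetime cells to ISO 8601 strings.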
def convert(self):
    xlsclass = XLSTableSet
    if 'xlsx' == self.excel_type:
        xlsclass = XLSXTableSet
    table_set = xlsclass.from_fileobj(self.stream)
    try:
        row_set = table_set.tables[self.sheet_number]
    except IndexError:
        raise Exception('This file does not have worksheet number %d'
                        % (self.sheet_number + 1))
    offset, headers = headers_guess(row_set.sample)
    fields = []
    dup_columns = {}
    noname_count = 1
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', str(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    result = []
    for row in row_set:
        # a fresh dict per row, so each appended entry stays distinct
        info = {}
        for index, cell in enumerate(row):
            if isinstance(cell.value, datetime):
                info[cell.column] = cell.value.isoformat()
            else:
                info[cell.column] = cell.value
        result.append(info)
    return fields, result
Developer: Big-Data | Project: data-converters | Lines: 39 | Source: xls_json_converter.py
Note: The messytables.headers_processor examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by many developers; copyright in the source code remains with the original authors. Please refer to each project's License before distributing or using the code, and do not repost without permission.