I'm trying to merge multiple files into one file using python, I've tried several methods but they all result in the final file missing out on some lines. The size of the file can vary a lot, so I'd prefer using something which does not load the whole file into memory.
My knowledge on this is a bit limited but I read that it's probably due to the writing buffering aka, the file is not immediately written, the information is instead kept momentarily in memory and later written to the file.
I've tried multiple ways to solve this: using shutil.copyfileobj, classical python's read/write , adding a tag to the end of the file, checking the tail of both files, using file.flush followed by os.fsync, and finally, adding a few seconds of time.sleep. Everything fails, could anyone advise on an infallible way to merge files?
Some approaches seem to work fine on my local PC, but when tried on an another system (HPC) the error occurs, so this is kinda hard to replicate.
these are all the approaches I tried so far:
#support functions
def tail(file_path):
last_line = None
with open(file_path) as file:
line=file.readline()
while line:
last_line=str(line)
line=file.readline()
return last_line
def wait_for_flush(output_file,tail_in):
c = 0
while not file_exists(output_file):
sleep(5)
c += 1
if c > 100: raise BrokenConcatenation(output_file)
tail_out = tail(output_file)
while tail_out != tail_in:
while not tail_out:
sleep(2)
tail_out = tail(output_file)
c += 1
if c > 100: raise BrokenConcatenation(output_file)
tail_out = tail(output_file)
c += 1
sleep(2)
if c > 100: raise BrokenConcatenation(output_file)
def merge_two_files(file1,file2):
with open(file1, 'a+') as f1:
with open(file2) as f2:
line=f2.readline()
while line:
f1.write(line)
line=f2.readline()
#forcing disk write
f1.flush()
os.fsync(f1)
#main functions
def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
print(output_file)
list_files=list(list_file_paths)
while len(list_files)>1:
file1=list_files.pop(0)
file2=list_files.pop(0)
merge_two_files(file1,file2)
sleep(1)
os.remove(file2)
list_files.append(file1)
final_file=list_files.pop()
move_file(final_file,output_file)
def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
with open(output_file, 'wb',buffering=0) as wfd:
for f in list_file_paths:
with open(f,'rb') as fd:
shutil.copyfileobj(fd, wfd)
#forcing disk write
wfd.flush()
os.fsync(wfd)
sleep(2)
def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
with open(output_file, 'w+') as wfd:
for f in list_file_paths:
with open(f) as fd:
line = fd.readline()
while line:
wfd.write(line)
line = fd.readline()
if add_tag:
tail_in='#'+f+'
'
wfd.write(tail_in)
else: tail_in=tail(f)
# forcing disk write
wfd.flush()
os.fsync(wfd)
wait_for_flush(output_file,tail_in)
#resets file whenever we open file, doesnt work
def concat_files(output_file,list_file_paths,stdout_file=None):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
for f in list_file_paths:
with open(output_file, 'wb') as wfd:
with open(f,'rb') as fd:
shutil.copyfileobj(fd, wfd)
#forcing disk write
wfd.flush()
os.fsync(wfd)
def concat_files(output_file,list_file_paths,stdout_file=None):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
with open(output_file, 'w+') as outfile:
for f in list_file_paths:
with open(f) as infile:
line=infile.readline()
while line:
outfile.write(line)
line=infile.readline()
#forcing disk write
outfile.flush()
os.fsync(outfile)
def concat_files(output_file,list_file_paths,stdout_file=None):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
with open(output_file, 'wb') as wfd:
for f in list_file_paths:
with open(f,'rb') as fd:
shutil.copyfileobj(fd, wfd)
#forcing disk write
wfd.flush()
os.fsync(wfd)