Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
103 views
in Technique[技术] by (71.8m points)

Concatenating multiple files python

I'm trying to merge multiple files into one file using Python. I've tried several methods, but they all result in the final file missing some lines. The size of the files can vary a lot, so I'd prefer something that does not load a whole file into memory.

My knowledge of this is a bit limited, but I read that it's probably due to write buffering, i.e. the data is not written to the file immediately; it is instead held momentarily in memory and written to the file later.

I've tried multiple ways to solve this: using shutil.copyfileobj, classic Python read/write, adding a tag to the end of the file, checking the tail of both files, using file.flush followed by os.fsync, and finally adding a few seconds of time.sleep. Everything fails. Could anyone advise on an infallible way to merge files? Some approaches seem to work fine on my local PC, but when tried on another system (an HPC cluster) the error occurs, so this is hard to replicate.

these are all the approaches I tried so far:



#support functions
def tail(file_path):
    """Return the last line of the text file at *file_path*.

    The file is streamed line by line, so only one line is held in memory
    at a time.  The returned line keeps its trailing newline if it has one.
    Returns None when the file contains no lines at all.
    """
    final = None
    with open(file_path) as handle:
        # Walk the whole file; whatever line we saw last is the tail.
        for current in handle:
            final = current
    return final


def wait_for_flush(output_file,tail_in):
    """Block until the last line of *output_file* equals *tail_in*.

    First waits for *output_file* to exist, then repeatedly re-reads its
    last line (via ``tail``) until it matches *tail_in* exactly, trailing
    newline included.

    A single counter ``c`` is shared by all three loops, so the function
    gives up once more than 100 polls have been made in total and raises
    ``BrokenConcatenation(output_file)``.

    NOTE(review): ``file_exists``, ``sleep`` and ``BrokenConcatenation`` are
    not defined in the visible code; presumably an existence check,
    ``time.sleep`` and a project exception -- confirm.
    """
    c = 0
    # Phase 1: wait (5 s per poll) for the output file to appear at all.
    while not file_exists(output_file):
        sleep(5)
        c += 1
        if c > 100: raise BrokenConcatenation(output_file)
    tail_out = tail(output_file)
    # Phase 2: poll until the output's last line is the expected one.
    while tail_out != tail_in:
        # The output has no lines yet (tail() gave None): keep polling
        # every 2 s until at least one line shows up.
        while not tail_out:
            sleep(2)
            tail_out = tail(output_file)
            c += 1
            if c > 100: raise BrokenConcatenation(output_file)
        # Re-read the tail, then pause 2 s before the next comparison.
        tail_out = tail(output_file)
        c += 1
        sleep(2)
        if c > 100: raise BrokenConcatenation(output_file)

def merge_two_files(file1,file2):
    """Append the contents of *file2* to the end of *file1*.

    *file1* is opened in ``'a+'`` mode (created if missing) and *file2* is
    streamed into it one line at a time, so neither file is loaded into
    memory as a whole.  *file2* is left untouched.
    """
    with open(file1, 'a+') as target:
        with open(file2) as source:
            for text_line in source:
                target.write(text_line)
        # Push Python's buffer to the OS, then ask the OS to commit to disk.
        target.flush()
        os.fsync(target.fileno())

#main functions

def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
    """Concatenate the files in *list_file_paths*, in order, into *output_file*.

    The inputs are consumed: every file after the first is appended to the
    first one with ``merge_two_files`` and then deleted, and the first file
    is finally handed to ``move_file`` to become *output_file*.

    Files are folded into the first file strictly left to right.  Re-queuing
    the accumulated file at the end of the work list (as an earlier version
    did) scrambles the order whenever the number of files is not a power of
    two, e.g. ``[a, b, c]`` came out as ``c, a, b``.

    Parameters:
        output_file: destination path of the merged file.
        list_file_paths: iterable of input paths; must not be empty.
        stdout_file: file object for the progress message (None = stdout).
        add_tag: accepted for signature compatibility; unused here.

    Raises:
        IndexError: if *list_file_paths* is empty.

    NOTE(review): ``sleep`` and ``move_file`` are defined outside this block;
    presumably ``time.sleep`` and a rename/move helper -- confirm.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    # Also echoed on the real stdout, independent of stdout_file.
    print(output_file)
    list_files=list(list_file_paths)
    # The first file is the accumulator (IndexError here on empty input).
    final_file=list_files[0]
    for next_file in list_files[1:]:
        merge_two_files(final_file,next_file)
        # Settle delay kept from the original before deleting the source.
        sleep(1)
        os.remove(next_file)
    move_file(final_file,output_file)


def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
    """Concatenate the files in *list_file_paths* byte-for-byte into *output_file*.

    Each input is streamed with ``shutil.copyfileobj`` so no file is loaded
    into memory as a whole.  After every input the output is flushed and
    ``os.fsync``-ed.

    The output is opened with the default *buffered* writer on purpose.
    With ``buffering=0`` ``open`` returns a raw file object whose ``write``
    may write fewer bytes than requested; ``shutil.copyfileobj`` ignores the
    return value, so a short write silently drops data.  The buffered
    writer keeps retrying until everything has been handed to the OS.

    No sleep follows the fsync: ``os.fsync`` only returns once the OS has
    committed the data, so an extra delay adds nothing.

    Parameters:
        output_file: destination path; created or truncated.
        list_file_paths: iterable of input paths, concatenated in order.
        stdout_file: file object for the progress message (None = stdout).
        add_tag: accepted for signature compatibility; unused here.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    with open(output_file, 'wb') as wfd:
        for f in list_file_paths:
            with open(f,'rb') as fd:
                shutil.copyfileobj(fd, wfd)
                # Drain Python's buffer first, then force the OS to disk.
                wfd.flush()
                os.fsync(wfd.fileno())

def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
    """Concatenate text files line by line, verifying each one landed on disk.

    After each input is copied, the output is flushed and fsync-ed and
    ``wait_for_flush`` is asked to confirm that the output's last line is the
    expected one:

    * with ``add_tag=True`` a marker line ``'#<input path>\\n'`` is written
      after each input and used as the expected tail;
    * otherwise the last line copied from the input is the expected tail.

    When an input contributed nothing (empty file, no tag) there is no new
    tail to wait for, so the check is skipped; comparing against ``None``
    would otherwise never match the previous file's last line and end in a
    spurious failure.

    Parameters:
        output_file: destination path; created or truncated.
        list_file_paths: iterable of input paths, concatenated in order.
        stdout_file: file object for the progress message (None = stdout).
        add_tag: write a ``#<path>`` marker line after each input.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    with open(output_file, 'w+') as wfd:
        for f in list_file_paths:
            # Remember the last line while copying so the input does not
            # have to be read a second time just to find its tail.
            last_line = None
            with open(f) as fd:
                for line in fd:
                    wfd.write(line)
                    last_line = line
            if add_tag:
                tail_in = '#' + f + '\n'
                wfd.write(tail_in)
            else:
                tail_in = last_line
            # forcing disk write
            wfd.flush()
            os.fsync(wfd.fileno())
            if tail_in is not None:
                wait_for_flush(output_file,tail_in)


#truncates the output once, then re-opens it in append mode for each input
def concat_files(output_file,list_file_paths,stdout_file=None):
    """Concatenate the files in *list_file_paths* byte-for-byte into *output_file*.

    The output file is re-opened for every input, so each input's data is
    flushed, fsync-ed and closed before the next one starts.

    Opening with ``'wb'`` truncates the file, so doing that inside the loop
    wipes everything written so far and leaves only the last input.  The
    output is therefore truncated exactly once up front and every input is
    then added in append mode (``'ab'``).

    Parameters:
        output_file: destination path; created or truncated.
        list_file_paths: iterable of input paths, concatenated in order.
        stdout_file: file object for the progress message (None = stdout).
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    # Start from an empty output file; stale content must not survive.
    with open(output_file, 'wb'):
        pass
    for f in list_file_paths:
        with open(output_file, 'ab') as wfd:
            with open(f,'rb') as fd:
                shutil.copyfileobj(fd, wfd)
                #forcing disk write
                wfd.flush()
                os.fsync(wfd.fileno())


def concat_files(output_file,list_file_paths,stdout_file=None):
    """Concatenate text files, in order, into *output_file*.

    Inputs are streamed one line at a time.  After each input has been
    copied, the output is flushed and fsync-ed.

    Parameters:
        output_file: destination path; created or truncated (``'w+'``).
        list_file_paths: iterable of input paths, concatenated in order.
        stdout_file: file object for the progress message (None = stdout).
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    with open(output_file, 'w+') as sink:
        for path in list_file_paths:
            with open(path) as source:
                # writelines pulls lines lazily from the open file.
                sink.writelines(source)
            # Commit this input's data before moving on to the next one.
            sink.flush()
            os.fsync(sink.fileno())


def concat_files(output_file,list_file_paths,stdout_file=None):
    """Concatenate the files in *list_file_paths* byte-for-byte into *output_file*.

    Every input is copied in fixed-size chunks through a buffered binary
    writer, then the output is flushed and fsync-ed before the next input
    is opened.

    Parameters:
        output_file: destination path; created or truncated (``'wb'``).
        list_file_paths: iterable of input paths, concatenated in order.
        stdout_file: file object for the progress message (None = stdout).
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    block_size = 64 * 1024
    with open(output_file, 'wb') as sink:
        for path in list_file_paths:
            with open(path,'rb') as source:
                # Plain read/write loop; an empty read marks end of file.
                data = source.read(block_size)
                while data:
                    sink.write(data)
                    data = source.read(block_size)
                # Commit this input's bytes while it is still open.
                sink.flush()
                os.fsync(sink.fileno())


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Answer

0 votes
by (71.8m points)

If you don't want to read large files into memory, I would say this should just work:

def concat_files(output_file, list_file_paths):
    """Concatenate text files, in order, into a freshly created *output_file*.

    Each input is streamed one line at a time, so no file is loaded into
    memory as a whole.  After every input a marker line ``eof - <path>`` is
    written, which makes it easy to see in the result where each input
    ended (and whether any input was cut short).

    Parameters:
        output_file: destination path; created or truncated.
        list_file_paths: iterable of input paths, concatenated in order.
    """
    print('Concatenating files into', output_file)
    with open(output_file, 'w') as wfd:
        for f in list_file_paths:
            print(f, '...')
            with open(f) as fd:
                for line in fd:
                    wfd.write(line)
                wfd.write(f'eof - {f}\n')  # mod to indicate end of this file
    print('Done.')

This should create the output_file as a new file and read each file from list_file_paths, a line at a time, writing to the new file.

Update: see the line marked "mod" in the code above, which writes a marker line to indicate the end of each input file.


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

2.1m questions

2.1m answers

60 comments

57.0k users

...