Per my understanding, you could break your blob file into the expected pieces (100 MB each), then leverage CloudBlockBlob.DownloadRangeToStream to download each chunk. Here is my code snippet; you could refer to it:
ParallelDownloadBlob
/// <summary>
/// Downloads the byte range [startRange, endRange) of a block blob into
/// <paramref name="outPutStream"/> by splitting it into 1 MB sub-ranges and
/// downloading up to 5 of them in parallel.
/// </summary>
/// <param name="outPutStream">Seekable, writable destination stream; positions are relative to <paramref name="startRange"/>.</param>
/// <param name="blob">Source block blob.</param>
/// <param name="startRange">Inclusive start offset within the blob.</param>
/// <param name="endRange">Exclusive end offset within the blob.</param>
private static void ParallelDownloadBlob(Stream outPutStream, CloudBlockBlob blob, long startRange, long endRange)
{
    // NOTE: the original called blob.FetchAttributes() here, but this method
    // never reads blob.Properties — the range comes entirely from the
    // parameters — so that was one wasted network round trip per call.
    const int bufferLength = 1 * 1024 * 1024; // 1 MB per range request

    // Pre-compute the (offset, length) pairs that cover the requested range.
    var chunks = new Queue<KeyValuePair<long, long>>();
    long offset = startRange;
    long remaining = endRange - startRange;
    while (remaining > 0)
    {
        long chunkLength = Math.Min(bufferLength, remaining);
        chunks.Enqueue(new KeyValuePair<long, long>(offset, chunkLength));
        offset += chunkLength;
        remaining -= chunkLength;
    }

    Parallel.ForEach(chunks,
        new ParallelOptions()
        {
            MaxDegreeOfParallelism = 5
        }, (chunk) =>
        {
            // Each worker downloads its sub-range into memory, then copies it
            // into the shared output stream under a lock (Stream is not thread-safe).
            using (var ms = new MemoryStream((int)chunk.Value)) // presized to avoid buffer regrowth
            {
                blob.DownloadRangeToStream(ms, chunk.Key, chunk.Value);
                lock (outPutStream)
                {
                    outPutStream.Position = chunk.Key - startRange;
                    var bytes = ms.ToArray();
                    outPutStream.Write(bytes, 0, bytes.Length);
                }
            }
        });
}
Program Main
var container = storageAccount.CreateCloudBlobClient().GetContainerReference(defaultContainerName);
var blob = container.GetBlockBlobReference("code.txt");
blob.FetchAttributes(); // populate blob.Properties.Length
long blobTotalLength = blob.Properties.Length;
long chunkLength = 10 * 1024; // divide the blob into pieces of 10KB each

// Use '<' (not '<='): with '<=' an exact multiple of chunkLength would yield a
// final zero-length range and create an empty output file.
for (long i = 0; i < blobTotalLength; i += chunkLength)
{
    long startRange = i;
    long endRange = Math.Min(i + chunkLength, blobTotalLength);

    // Build the path with Path.Combine — the original $"resources\code_…" literal
    // contained the invalid escape sequence '\c' and would not compile.
    string path = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "resources", $"code_[{startRange}]_[{endRange}].txt");
    using (var fs = new FileStream(path, FileMode.Create))
    {
        Console.WriteLine($"ParallelDownloadBlob from range [{startRange}] to [{endRange}] start...");
        Stopwatch sp = Stopwatch.StartNew();
        ParallelDownloadBlob(fs, blob, startRange, endRange);
        sp.Stop();
        Console.WriteLine($"download done, time cost:{sp.ElapsedMilliseconds / 1000.0}s");
    }
}
RESULT
UPDATE:
Based on your requirement, I recommend that you download your blob into a single file, then leverage LumenWorks.Framework.IO to read your large file line by line, track the number of bytes you have read, and save the records into new CSV files of up to 100 MB each. Here is a code snippet; you could refer to it:
// Stream the CSV record by record with LumenWorks' CsvReader
// (first 'true' argument means the file has a header row).
using (CsvReader reader = new CsvReader(new StreamReader("data.csv"), true))
{
    string[] columnNames = reader.GetFieldHeaders();
    int columnCount = reader.FieldCount;
    while (reader.ReadNextRecord())
    {
        for (int col = 0; col < columnCount; col++)
        {
            var value = reader[col] == null ? "MISSING" : reader[col];
            Console.Write(string.Format("{0} = {1};", columnNames[col], value));
        }
        //TODO:
        //1. Read the current record and keep a running total of the bytes read;
        //2. Once the running total reaches 100MB, create a new CSV file and save the current record into it.
    }
}
Additionally, you could refer to A Fast CSV Reader and CsvHelper for more details.
UPDATE2
Code sample for breaking a large CSV file into smaller CSV files of a fixed byte size. I used CsvHelper 2.16.3 for the following code snippet; you could refer to it:
string[] headers = new string[0];
// NOTE: the original post lost the backslashes in this path; restore the separators.
using (var sr = new StreamReader(@"C:\Users\v-brucch\Desktop\BlobHourMetrics.csv")) //83.9KB sample
{
    using (CsvHelper.CsvReader csvReader = new CsvHelper.CsvReader(sr,
        new CsvHelper.Configuration.CsvConfiguration()
        {
            Delimiter = ",",
            Encoding = Encoding.UTF8
        }))
    {
        // Read the header row once; it is re-emitted at the top of every chunk file.
        if (csvReader.ReadHeader())
        {
            headers = csvReader.FieldHeaders;
        }
        TextWriter writer = null;
        CsvWriter csvWriter = null;
        long readBytesCount = 0;
        long chunkSize = 30 * 1024; // target byte size per output CSV file (30KB)
        try
        {
            while (csvReader.Read())
            {
                var curRecord = csvReader.CurrentRecord;
                // Approximate serialized size: field bytes + one separator per field + newline.
                var curRecordByteCount = curRecord.Sum(r => Encoding.UTF8.GetByteCount(r)) + headers.Count() + 1;
                readBytesCount += curRecordByteCount;
                // Start a new chunk file on the first record, or once the current chunk is full.
                if (writer == null || readBytesCount > chunkSize)
                {
                    // Restart the counter, accounting for the header row written below.
                    readBytesCount = curRecordByteCount + headers.Sum(h => Encoding.UTF8.GetByteCount(h)) + headers.Count() + 1;
                    if (writer != null)
                    {
                        writer.Flush();
                        writer.Close();
                    }
                    string fileName = $"BlobHourMetrics_{Guid.NewGuid()}.csv";
                    writer = new StreamWriter(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, fileName), true);
                    csvWriter = new CsvWriter(writer);
                    csvWriter.Configuration.Encoding = Encoding.UTF8;
                    // Re-emit the header row so each chunk is self-describing.
                    foreach (var header in headers)
                    {
                        csvWriter.WriteField(header);
                    }
                    csvWriter.NextRecord();
                }
                // Append the current record to the active chunk.
                foreach (var field in curRecord)
                {
                    csvWriter.WriteField(field);
                }
                csvWriter.NextRecord();
            }
        }
        finally
        {
            // Ensure the last chunk is flushed and the writer released even if an
            // exception is thrown mid-loop (the original leaked it in that case).
            if (writer != null)
            {
                writer.Flush();
                writer.Close();
            }
        }
    }
}
RESULT