Hi There,
I have a use case where we receive a large PDF file (700 to 800 MB in size) containing about 400k pages, and the task is to split the file into individual pages. I am getting an out-of-memory exception after splitting around 100k pages. Please suggest any alternatives or workarounds.
My code to split the PDF file is below:
public int SplitPdfFileByPageDelimiter(string outputFilesDir, string pageDelimiterString, int extractStringLength) { if (string.IsNullOrEmpty(pageDelimiterString)) { throw new ArgumentNullException("pageDelimiterString", "[SplitPdfFileByPageDelimiter()]: delimiterString for spliting pdf file is null or empty."); }<SPAN style="COLOR: green">// open document</SPAN> <SPAN style="COLOR: blue">var</SPAN> pdfDocument = <SPAN style="COLOR: blue">new</SPAN> Aspose.Pdf.<SPAN style="COLOR: #2b91af">Document</SPAN>(<SPAN style="COLOR: blue">this</SPAN>.InputFileStream); <SPAN style="COLOR: blue">if</SPAN> (<SPAN style="COLOR: blue">string</SPAN>.IsNullOrEmpty(outputFilesDir)) { <SPAN style="COLOR: blue">throw</SPAN> <SPAN style="COLOR: blue">new</SPAN> <SPAN style="COLOR: #2b91af">Exception</SPAN>(<SPAN style="COLOR: #a31515">"OutputFileDirectory missing"</SPAN>); } <SPAN style="COLOR: blue">if</SPAN> (!<SPAN style="COLOR: #2b91af">Directory</SPAN>.Exists(outputFilesDir)) { <SPAN style="COLOR: #2b91af">Directory</SPAN>.CreateDirectory(outputFilesDir); } <SPAN style="COLOR: blue">var</SPAN> outputFileName = <SPAN style="COLOR: #2b91af">Path</SPAN>.GetFileName(<SPAN style="COLOR: blue">this</SPAN>.InputFilePath); <SPAN style="COLOR: blue">if</SPAN> (<SPAN style="COLOR: blue">string</SPAN>.IsNullOrEmpty(outputFileName)) { outputFileName = <SPAN style="COLOR: blue">string</SPAN>.Format(<SPAN style="COLOR: #a31515">"</SPAN><SPAN style="COLOR: mediumseagreen">{0}</SPAN><SPAN style="COLOR: #a31515">.pdf"</SPAN>, <SPAN style="COLOR: #2b91af">Path</SPAN>.GetDirectoryName(outputFilesDir)); } <SPAN style="COLOR: blue">var</SPAN> outputFileFormat = <SPAN style="COLOR: blue">string</SPAN>.Concat(<SPAN style="COLOR: #2b91af">Path</SPAN>.GetFileNameWithoutExtension(outputFileName), <SPAN style="COLOR: #a31515">"_{0}"</SPAN>, <SPAN style="COLOR: #2b91af">Path</SPAN>.GetExtension(outputFileName)); <SPAN style="COLOR: blue">var</SPAN> docCount = 1; <SPAN style="COLOR: blue">var</SPAN> 
resetCount = 0; <SPAN style="COLOR: blue">var</SPAN> document = <SPAN style="COLOR: blue">new</SPAN> Aspose.Pdf.<SPAN style="COLOR: #2b91af">Document</SPAN>(); <SPAN style="COLOR: green">// loop through all the pages</SPAN> <SPAN style="COLOR: blue">foreach</SPAN> (<SPAN style="COLOR: #2b91af">Page</SPAN> pdfPage <SPAN style="COLOR: blue">in</SPAN> pdfDocument.Pages) { resetCount++; document.Pages.Add(pdfPage); <SPAN style="COLOR: blue">var</SPAN> textFragmentAbsorber = <SPAN style="COLOR: blue">new</SPAN> <SPAN style="COLOR: #2b91af">TextFragmentAbsorber</SPAN>(); pdfPage.Accept(textFragmentAbsorber); <SPAN style="COLOR: blue">var</SPAN> textFragmentCollection = textFragmentAbsorber.TextFragments; <SPAN style="COLOR: blue">foreach</SPAN> (<SPAN style="COLOR: #2b91af">TextFragment</SPAN> textFragment <SPAN style="COLOR: blue">in</SPAN> textFragmentCollection) { <SPAN style="COLOR: blue">var</SPAN> pdfFileUniqueIdIndex = textFragment.Text.IndexOf(pageDelimiterString); <SPAN style="COLOR: blue">if</SPAN> (pdfFileUniqueIdIndex <= -1) { <SPAN style="COLOR: blue">continue</SPAN>; } pdfFileUniqueIdIndex += pageDelimiterString.Length; <SPAN style="COLOR: blue">var</SPAN> pdfFileUniqueId = extractStringLength > 0 ? 
textFragment.Text.Mid(pdfFileUniqueIdIndex, extractStringLength).Trim() : docCount.ToString(); <SPAN style="COLOR: blue">if</SPAN> (<SPAN style="COLOR: blue">string</SPAN>.IsNullOrWhiteSpace(pdfFileUniqueId)) { pdfFileUniqueId = <SPAN style="COLOR: blue">string</SPAN>.Format(<SPAN style="COLOR: #a31515">"NoPdfFileUniqueId.</SPAN><SPAN style="COLOR: mediumseagreen">{0}</SPAN><SPAN style="COLOR: #a31515">"</SPAN>, Guid.NewGuid().ToString()); } <SPAN style="COLOR: blue">var</SPAN> pdfFileFullName = <SPAN style="COLOR: #2b91af">Path</SPAN>.Combine(outputFilesDir, <SPAN style="COLOR: blue">string</SPAN>.Format(outputFileFormat, pdfFileUniqueId)); <SPAN style="COLOR: blue">if</SPAN> (!<SPAN style="COLOR: #2b91af">File</SPAN>.Exists(pdfFileFullName)) { document.Save(pdfFileFullName); } document.FreeMemory(); document.Dispose(); docCount += 1; document = <SPAN style="COLOR: blue">new</SPAN> Aspose.Pdf.<SPAN style="COLOR: #2b91af">Document</SPAN>(); <SPAN style="COLOR: blue">break</SPAN>; } pdfPage.FreeMemory(); <SPAN style="COLOR: green">// After every 200 pages let the process to sleep for couple of secs.</SPAN> <SPAN style="COLOR: blue">if</SPAN> (resetCount >= 200) { pdfDocument.FreeMemory(); <SPAN style="COLOR: #2b91af">Console</SPAN>.WriteLine(<SPAN style="COLOR: blue">string</SPAN>.Format(<SPAN style="COLOR: #a31515">"[SplitPdfFileByPageDelimiter()]: Going to sleep at document count : [</SPAN><SPAN style="COLOR: mediumseagreen">{0}</SPAN><SPAN style="COLOR: #a31515">]"</SPAN>, docCount)); <SPAN style="COLOR: #2b91af">Thread</SPAN>.Sleep(2000); resetCount = 0; } }</PRE>